From 22ada204ba8fd7e1ea72014171c07fd1ee10a3e4 Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 2 Oct 2024 11:55:03 -0400 Subject: [PATCH 01/35] [WIP] XRT-LITE HAL --- cmake/iree_aie_utils.cmake | 3 + iree_compiler_plugin.cmake | 3 + iree_runtime_plugin.cmake | 6 + runtime/src/iree-amd-aie/CMakeLists.txt | 4 + .../driver/xrt-lite/CMakeLists.txt | 34 ++ .../src/iree-amd-aie/driver/xrt-lite/api.h | 25 ++ .../driver/xrt-lite/cts/CMakeLists.txt | 93 ++++++ .../xrt-lite/cts/executable_cache_test.mlir | 33 ++ .../xrt_lite_command_buffer_dispatch_test.cc | 181 +++++++++++ .../iree-amd-aie/driver/xrt-lite/driver.cc | 147 +++++++++ .../driver/xrt-lite/native_executable.cc | 297 ++++++++++++++++++ .../driver/xrt-lite/native_executable.h | 44 +++ .../xrt-lite/registration/CMakeLists.txt | 24 ++ .../xrt-lite/registration/driver_module.c | 64 ++++ .../xrt-lite/registration/driver_module.h | 24 ++ .../src/iree-amd-aie/driver/xrt-lite/util.h | 22 ++ .../src/iree-amd-aie/schemas/CMakeLists.txt | 13 + .../schemas/pdi_executable_def.fbs | 57 ++++ 18 files changed, 1074 insertions(+) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/api.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/xrt_lite_command_buffer_dispatch_test.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h 
create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/util.h create mode 100644 runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs diff --git a/cmake/iree_aie_utils.cmake b/cmake/iree_aie_utils.cmake index 2e0911dfa..aa4c57027 100644 --- a/cmake/iree_aie_utils.cmake +++ b/cmake/iree_aie_utils.cmake @@ -29,6 +29,8 @@ function(replace_string_in_file _file _match_string _replace_string) if(NOT (EXISTS ${_file})) message(FATAL_ERROR "file ${_file} does not exist") endif() + set(_lock_file "${_file}.lock") + file(LOCK "${_lock_file}" GUARD FUNCTION) file(READ "${_file}" _file_contents) if(_file_contents STREQUAL "") message(FATAL_ERROR "empty file contents for ${_file}") @@ -38,5 +40,6 @@ function(replace_string_in_file _file _match_string _replace_string) message(FATAL_ERROR "empty replacement contents for ${_file}") endif() file(WRITE "${_file}" "${_file_contents}") + file(LOCK "${_lock_file}" RELEASE) endfunction() diff --git a/iree_compiler_plugin.cmake b/iree_compiler_plugin.cmake index 958d6de46..874e6df00 100644 --- a/iree_compiler_plugin.cmake +++ b/iree_compiler_plugin.cmake @@ -22,5 +22,8 @@ if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) endif() include(iree_aie_bootgen) +set(IREE_TARGET_BACKEND_XRT_LITE ON CACHE BOOL "") +set(IREE_TARGET_BACKEND_XRT ON CACHE BOOL "") + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/compiler/plugins/target/AMD-AIE target/AMD-AIE) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/compiler/plugins/preprocessing/XDNA-OPLIB preprocessing/XDNA-OPLIB) diff --git a/iree_runtime_plugin.cmake b/iree_runtime_plugin.cmake index 15a4d07da..3b168ebc3 100644 --- a/iree_runtime_plugin.cmake +++ b/iree_runtime_plugin.cmake @@ -26,5 +26,11 @@ if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) include(iree_aie_bootgen) endif() +set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER OFF) +if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) + message(STATUS "Enabling XRT-LITE build because it is an enabled HAL driver") + set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER ON) +endif() + 
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/runtime/src AMD-AIE) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/experimental AMD-AIE-experimental) diff --git a/runtime/src/iree-amd-aie/CMakeLists.txt b/runtime/src/iree-amd-aie/CMakeLists.txt index bfa015081..d861c846d 100644 --- a/runtime/src/iree-amd-aie/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/CMakeLists.txt @@ -8,6 +8,10 @@ if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) add_subdirectory(driver/xrt) endif() +if(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) + add_subdirectory(driver/xrt-lite) +endif() + # Flatbuffer schema generation does not require XRT. Moreover the generated # flatbuffer header files are used by the compiler to create artefacts # (.vmfb file), and so the schema sub-directory is required even when not diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt new file mode 100644 index 000000000..56f3e5b68 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_add_all_subdirs() + +iree_register_external_hal_driver( + NAME + xrt-lite + DRIVER_TARGET + iree-amd-aie::driver::xrt-lite::registration + REGISTER_FN + iree_hal_xrt_lite_driver_module_register +) + +iree_cc_library( + NAME + xrt-lite + SRCS + api.h + native_executable.h + native_executable.cc + driver.cc + util.h + DEPS + iree::base + iree::base::core_headers + iree::base::internal::flatcc::parsing + iree-amd-aie::schemas::pdi_executable_def_c_fbs + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h new file mode 100644 index 000000000..8e3b00649 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -0,0 +1,25 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( + iree_string_view_t identifier, iree_allocator_t host_allocator, + iree_hal_driver_t** out_driver); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt new file mode 100644 index 000000000..28467cb90 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -0,0 +1,93 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +include(CMakeDependentOption) + +set(PEANO_INSTALL_DIR "" CACHE PATH "") +set(VITIS_DIR "" CACHE PATH "") +if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) + message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") +endif() +cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") +set(TARGET_DEVICE "npu1_4col" CACHE STRING "") + +iree_bytecode_module( + NAME + xrt_lite_executable_cache_test_module + MODULE_FILE_NAME + xrt_lite_executable_cache_test.bin + SRC + "${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" + FLAGS + --compile-mode=hal-executable + --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} + --iree-hal-target-backends=xrt-lite + --iree-amdaie-lower-to-aie-pipeline=air + --iree-amdaie-target-device=${TARGET_DEVICE} + --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} + --iree-amd-aie-vitis-install-dir=${VITIS_DIR} + --iree-amd-aie-enable-chess=$ + --iree-amd-aie-show-invoked-commands + PUBLIC + TESTONLY +) + +iree_c_embed_data( + NAME + xrt_lite_executables_c + SRCS + xrt_lite_executable_cache_test.bin + C_FILE_OUTPUT + xrt_lite_executables_c.c + H_FILE_OUTPUT + xrt_lite_executables_c.h + IDENTIFIER + iree_cts_testdata_executables_aie_xrt_lite + STRIP_PREFIX + xrt_lite_ + DEPENDS + ::xrt_lite_executable_cache_test_module + FLATTEN + PUBLIC + TESTONLY +) + +iree_hal_cts_test_suite( + DRIVER_NAME + xrt-lite + DRIVER_REGISTRATION_HDR + "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" + DRIVER_REGISTRATION_FN + "iree_hal_xrt_lite_driver_module_register" + COMPILER_TARGET_BACKEND + "xrt-lite" + EXECUTABLE_FORMAT + "\"amdaie-pdi-fb\"" + DEPS + iree-amd-aie::driver::xrt-lite::registration + INCLUDED_TESTS + "allocator" + "buffer_mapping" + "command_buffer" + "driver" +) + +iree_cc_test( + NAME + xrt_lite_command_buffer_dispatch_test + SRCS + xrt_lite_command_buffer_dispatch_test.cc + DEPS + ::xrt_lite_executables_c + 
iree-amd-aie::driver::xrt-lite::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main + iree::tools::testing::e2e::e2e_test_util +) + +target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_command_buffer_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir new file mode 100644 index 000000000..4a27d79e0 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir @@ -0,0 +1,33 @@ +// bootstrapped from https://github.com/nod-ai/iree-amd-aie/blob/9c4c167baf89a279888fba8db75907845946077c/tests/samples/matmul_pack_peel_objectfifo_e2e.mlir + +#pipeline_layout = #hal.pipeline.layout< + bindings = [ + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding + ], + flags = Indirect +> +hal.executable.source public @amdaie_fb { + hal.executable.export public @matmul_f32_dispatch_0_matmul_256x256x32_f32 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_f32_dispatch_0_matmul_256x256x32_f32() { + %c0_f32 = arith.constant 0.0 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor> -> 
tensor<256x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x256xf32> + %5 = tensor.empty() : tensor<256x256xf32> + %6 = linalg.fill ins(%c0_f32 : f32) outs(%5 : tensor<256x256xf32>) -> tensor<256x256xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%6 : tensor<256x256xf32>) -> tensor<256x256xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor> + return + } + } +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/xrt_lite_command_buffer_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/xrt_lite_command_buffer_dispatch_test.cc new file mode 100644 index 000000000..fbe4c0720 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/xrt_lite_command_buffer_dispatch_test.cc @@ -0,0 +1,181 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" +#include "iree/base/api.h" +#include "iree/base/string_view.h" +#include "iree/hal/api.h" +#include "iree/hal/buffer_view_util.h" +#include "iree/hal/cts/cts_test_base.h" +#include "iree/testing/gtest.h" +#include "iree/testing/status_matchers.h" +#include "tools/testing/e2e/test_utils.h" +#include "xrt_lite_executables_c.h" + +namespace iree::hal::cts { + +const char* get_test_driver_name() { return "xrt-lite"; } + +iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { + return iree_hal_xrt_lite_driver_module_register(registry); +} + +const char* get_test_executable_format() { return "amdaie-pdi-fb"; } + +iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { + const struct iree_file_toc_t* toc = + iree_cts_testdata_executables_aie_xrt_lite_create(); + const auto& file = toc[0]; + return iree_make_const_byte_span(file.data, file.size); +} +class CommandBufferDispatchTest + : public CTSTestBase<::testing::TestWithParam> { + protected: + void PrepareMatmulExecutable() { + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status_), &executable_cache_)); + + iree_hal_executable_params_t executable_params; + iree_hal_executable_params_initialize(&executable_params); + executable_params.caching_mode = + IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; + executable_params.executable_format = + iree_make_cstring_view(get_test_executable_format()); + executable_params.executable_data = get_test_executable_data( + iree_make_cstring_view("xrt-lite_executable_cache_test.bin")); + + IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable( + executable_cache_, &executable_params, &executable_)); + } + + void CleanupExecutable() { + iree_hal_executable_release(executable_); + 
iree_hal_executable_cache_release(executable_cache_);
+    IREE_ASSERT_OK(loop_status_);
+  }
+
+  iree_status_t loop_status_ = iree_ok_status();
+  iree_hal_executable_cache_t* executable_cache_ = nullptr;
+  iree_hal_executable_t* executable_ = nullptr;
+};
+
+int32_t generate_random_number(iree_hal_element_type_t element_type,
+                               int32_t seed) {
+  int32_t min = 0;
+  int32_t max = 0;
+  iree_test_utils_get_min_max_for_element_type(element_type, &min, &max);
+  uint32_t range = (max - min + 1);
+  return (int32_t)iree_test_utils_pseudorandom_range(
+             reinterpret_cast<uint32_t*>(&seed), range) +
+         min;
+}
+
+TEST_P(CommandBufferDispatchTest, DispatchMatmul) {
+  PrepareMatmulExecutable();
+
+  // Create the A and B input buffers and the zero-filled C output buffer.
+  constexpr iree_device_size_t WIDTH = 256;
+  constexpr iree_device_size_t M = WIDTH, K = WIDTH, N = WIDTH;
+  iree_hal_buffer_t *input_A = nullptr, *input_B = nullptr, *output_C = nullptr;
+  int32_t seed =
+      std::chrono::high_resolution_clock::now().time_since_epoch().count() >>
+      32;
+  int32_t a = generate_random_number(
+      iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed);
+  int32_t b = generate_random_number(
+      iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed + 1);
+  CreateFilledDeviceBuffer<float>(M * K * sizeof(float), a, &input_A);
+  CreateFilledDeviceBuffer<float>(K * N * sizeof(float), b, &input_B);
+  CreateFilledDeviceBuffer<float>(M * N * sizeof(float), 0, &output_C);
+
+  iree_hal_buffer_ref_t binding_refs[3];
+  iree_hal_buffer_binding_table_t binding_table =
+      iree_hal_buffer_binding_table_empty();
+  binding_refs[0] = {
+      /*binding=*/0,
+      /*buffer_slot=*/0,
+      /*buffer=*/input_A,
+      /*offset=*/0,
+      /*length=*/M * K * sizeof(float),
+  };
+  binding_refs[1] = {
+      /*binding=*/0,
+      /*buffer_slot=*/0,
+      /*buffer=*/input_B,
+      /*offset=*/0,
+      /*length=*/K * N * sizeof(float),
+  };
+  binding_refs[2] = {
+      /*binding=*/0,
+      /*buffer_slot=*/0,
+      /*buffer=*/output_C,
+      /*offset=*/0,
+      /*length=*/M * N * sizeof(float),
+  };
+  iree_hal_buffer_ref_list_t bindings = {
+      /*.count=*/IREE_ARRAYSIZE(binding_refs),
+      /*.values=*/binding_refs,
+  };
+
+  iree_hal_command_buffer_t* command_buffer = nullptr;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      binding_table.count, &command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+
+  uint32_t workgroup_count[3] = {1, 1, 1};
+  IREE_ASSERT_OK(iree_hal_command_buffer_dispatch(
+      command_buffer, executable_, /*entry_point=*/0, workgroup_count,
+      iree_const_byte_span_empty(), bindings, IREE_HAL_DISPATCH_FLAG_NONE));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_execution_barrier(
+      command_buffer,
+      /*source_stage_mask=*/IREE_HAL_EXECUTION_STAGE_DISPATCH |
+          IREE_HAL_EXECUTION_STAGE_TRANSFER |
+          IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
+      /*target_stage_mask=*/IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE |
+          IREE_HAL_EXECUTION_STAGE_DISPATCH | IREE_HAL_EXECUTION_STAGE_TRANSFER,
+      IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, /*memory_barrier_count=*/0,
+      /*memory_barriers=*/nullptr,
+      /*buffer_barrier_count=*/0, /*buffer_barriers=*/nullptr));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer, binding_table));
+
+  std::vector<float> output_values;
+  output_values.resize(M * N);  // resize, not reserve: d2h fills data()
+  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
+      device_, output_C,
+      /*source_offset=*/0, output_values.data(), M * N * sizeof(float),
+      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
+  std::vector<float> correct_output_values;
+  correct_output_values.resize(M * N);  // elements must exist before fill_n
+  std::fill_n(correct_output_values.data(), M * N, (float)WIDTH * (a * b));
+  int n_wrong = 0;
+  for (iree_device_size_t i = 0; i < M * N; ++i) {
+    if (output_values[i] != correct_output_values[i]) {
+      std::cout << "wrong @ i:" << i << ", " << output_values[i]
+                << " != " << correct_output_values[i] << "\n";
+      n_wrong += 1;
+    }
+  }
+  EXPECT_EQ(n_wrong, 0);
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(output_C);
+  iree_hal_buffer_release(input_B);
+  iree_hal_buffer_release(input_A);
+  CleanupExecutable();
+}
+
+INSTANTIATE_TEST_SUITE_P(CommandBufferTest, CommandBufferDispatchTest,
+                         ::testing::Values(RecordingType::kDirect),
+                         GenerateTestName());
+
+}  // namespace iree::hal::cts
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc
new file mode 100644
index 000000000..1e0d06c12
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc
@@ -0,0 +1,147 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/base/api.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/api.h"
+#include "util.h"
+
+// Maximum device path length we support. The path is always a 16 character hex
+// string.
+#define IREE_HAL_XRT_LITE_MAX_DEVICE_PATH_LENGTH 32
+// Maximum device name length we support.
+#define IREE_HAL_XRT_LITE_MAX_DEVICE_NAME_LENGTH 64
+
+struct iree_hal_xrt_lite_driver_t {
+  // Abstract resource used for injecting reference counting and vtable; must be
+  // at offset 0.
+ iree_hal_resource_t resource; + iree_allocator_t host_allocator; + iree_string_view_t identifier; + uint64_t device_hdl; +}; + +static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { + iree_hal_xrt_lite_driver_t* driver = + reinterpret_cast(base_driver); + iree_allocator_t host_allocator = driver->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(host_allocator, driver); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( + iree_hal_driver_t* base_driver, iree_allocator_t host_allocator, + iree_host_size_t* out_device_info_count, + iree_hal_device_info_t** out_device_infos) { + iree_hal_xrt_lite_driver_t* driver = + reinterpret_cast(base_driver); + uint64_t device_hdl = driver->device_hdl; + // Allocate the return infos and populate with the devices. + iree_hal_device_info_t* device_infos = nullptr; + iree_host_size_t single_info_size = + sizeof(iree_hal_device_info_t) + + (IREE_HAL_XRT_LITE_MAX_DEVICE_PATH_LENGTH + + IREE_HAL_XRT_LITE_MAX_DEVICE_NAME_LENGTH) * + sizeof(char); + IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, single_info_size, + (void**)&device_infos)); + + uint8_t* buffer_ptr = (uint8_t*)device_infos + sizeof(iree_hal_device_info_t); + memset(device_infos, 0, sizeof(*device_infos)); + + // device_infos->device_id = 0; + // std::string device_name = "aie2"; + // const size_t name_len = strlen(device_name.c_str()); + // if (name_len >= IREE_HAL_XRT_LITE_MAX_DEVICE_NAME_LENGTH) { + // return iree_make_status(IREE_STATUS_OUT_OF_RANGE, + // "device name out of range"); + // } + // buffer_ptr += iree_string_view_append_to_buffer( + // iree_make_string_view(device_name.c_str(), name_len), + // &device_infos->name, (char*)buffer_ptr); + iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED); + + *out_device_info_count = 1; + *out_device_infos = device_infos; + return status; +} + +static iree_status_t 
iree_hal_xrt_lite_driver_create_device_by_id(
+    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
+    iree_host_size_t param_count, const iree_string_pair_t* params,
+    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_xrt_lite_driver_t* driver =
+      reinterpret_cast<iree_hal_xrt_lite_driver_t*>(base_driver);
+  iree_string_view_t device_name = iree_make_cstring_view("xrt-lite");
+
+  // iree_status_t status = iree_hal_xrt_lite_device_create(
+  //     device_name, &driver->device_params, host_allocator, out_device);
+
+  iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path(
+    iree_hal_driver_t* base_driver, iree_string_view_t driver_name,
+    iree_string_view_t device_path, iree_host_size_t param_count,
+    const iree_string_pair_t* params, iree_allocator_t host_allocator,
+    iree_hal_device_t** out_device) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_xrt_lite_driver_t* driver =
+      reinterpret_cast<iree_hal_xrt_lite_driver_t*>(base_driver);
+  iree_string_view_t device_name = iree_make_cstring_view("xrt-lite");
+
+  // iree_status_t status = iree_hal_xrt_lite_device_create(
+  //     device_name, &driver->device_params, host_allocator, out_device);
+
+  iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+namespace {
+const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable = {
+    /*.destroy = */ iree_hal_xrt_lite_driver_destroy,
+    /*.query_available_devices = */
+    iree_hal_xrt_lite_driver_query_available_devices,
+    /*.dump_device_info = */ unimplemented,
+    /*.create_device_by_id = */ iree_hal_xrt_lite_driver_create_device_by_id,
+    /*.create_device_by_path = */
+    iree_hal_xrt_lite_driver_create_device_by_path,
+};
+}  // namespace
+
+IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create(
+    iree_string_view_t identifier, iree_allocator_t host_allocator,
+    iree_hal_driver_t** out_driver) {
+  IREE_ASSERT_ARGUMENT(out_driver);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // Allocate the driver with trailing storage for a copy of |identifier|.
+  iree_hal_xrt_lite_driver_t* driver = nullptr;
+  iree_host_size_t total_size = iree_sizeof_struct(*driver) + identifier.size;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, total_size, (void**)&driver));
+
+  iree_hal_resource_initialize(&iree_hal_xrt_lite_driver_vtable,
+                               &driver->resource);
+
+  driver->host_allocator = host_allocator;
+  iree_string_view_append_to_buffer(
+      identifier, &driver->identifier,
+      (char*)driver + iree_sizeof_struct(*driver));
+
+  *out_driver = reinterpret_cast<iree_hal_driver_t*>(driver);
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc
new file mode 100644
index 000000000..12b1e614c
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc
@@ -0,0 +1,297 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/native_executable.h" + +#include +#include + +#include "iree-amd-aie/schemas/pdi_executable_def_reader.h" +#include "iree-amd-aie/schemas/pdi_executable_def_verifier.h" +#include "iree/base/api.h" + +struct iree_hal_xrt_lite_native_executable_t { + iree_hal_resource_t resource; + iree_allocator_t host_allocator; + iree_host_size_t entry_point_count; + iree_hal_xrt_lite_kernel_info_t entry_points[]; +}; + +namespace { +extern const iree_hal_executable_vtable_t + iree_hal_xrt_lite_native_executable_vtable; +} + +static iree_hal_xrt_lite_native_executable_t* +iree_hal_xrt_lite_native_executable_cast(iree_hal_executable_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_native_executable_vtable); + return reinterpret_cast(base_value); +} + +static iree_status_t iree_hal_xrt_lite_native_executable_flatbuffer_verify( + iree_const_byte_span_t flatbuffer_data) { + if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "flatbuffer data is not present or less than 16 bytes (%zu total)", + flatbuffer_data.data_length); + } + + int verify_ret = iree_amd_aie_hal_xrt_lite_ExecutableDef_verify_as_root( + flatbuffer_data.data, flatbuffer_data.data_length); + if (verify_ret != flatcc_verify_ok) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "flatbuffer verification failed: %s", + flatcc_verify_error_string(verify_ret)); + } + + iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = + iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root(flatbuffer_data.data); + + flatbuffers_string_vec_t entry_points_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); + size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); + if (entry_point_count == 0) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "no entry points found in the 
executable"); + } + + for (size_t i = 0; i < entry_point_count; ++i) { + if (!flatbuffers_string_len( + flatbuffers_string_vec_at(entry_points_vec, i))) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "executable entry point %zu has no name", i); + } + } + + iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis = + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); + size_t number_pdi = iree_amd_aie_hal_xrt_lite_PdiDef_vec_len(pdis); + if (number_pdi == 0) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no pdi present"); + } + + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instr = + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); + size_t number_asm_instr = + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_len(asm_instr); + if (number_asm_instr != entry_point_count) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "number of entry points (%zu) and number of asm " + "instructions (%zu) mismatched", + entry_point_count, number_asm_instr); + } + + return iree_ok_status(); +} + +// iree_status_t iree_hal_xrt_lite_native_executable_create( +// const iree_hal_executable_params_t* executable_params, +// iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator, +// iree_hal_executable_t** out_executable) { +// IREE_ASSERT_ARGUMENT(device_allocator); +// IREE_ASSERT_ARGUMENT(executable_params); +// IREE_ASSERT_ARGUMENT(out_executable); +// +// IREE_TRACE_ZONE_BEGIN(z0); +// +// *out_executable = nullptr; +// iree_hal_xrt_lite_native_executable_t* executable = nullptr; +// +// IREE_RETURN_AND_END_ZONE_IF_ERROR( +// z0, iree_hal_xrt_lite_native_executable_flatbuffer_verify( +// executable_params->executable_data)); +// +// iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = +// iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root( +// executable_params->executable_data.data); +// +// flatbuffers_uint32_vec_t pdi_indices_vec = +// 
iree_amd_aie_hal_xrt_lite_ExecutableDef_pdi_indices_get(executable_def); +// +// flatbuffers_uint32_vec_t asm_instr_indices_vec = +// iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instr_indices_get( +// executable_def); +// +// flatbuffers_string_vec_t entry_points_vec = +// iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); +// +// iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis_vec = +// iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); +// +// iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instrs_vec = +// iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); +// +// iree_host_size_t entry_point_count = +// flatbuffers_string_vec_len(entry_points_vec); +// +// iree_host_size_t total_entry_point_name_chars = 0; +// IREE_TRACE({ +// for (iree_host_size_t entry_ordinal = 0; entry_ordinal < +// entry_point_count; +// entry_ordinal++) { +// const char* entry_name = +// flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); +// total_entry_point_name_chars += flatbuffers_string_len(entry_name); +// } +// }); +// +// iree_host_size_t total_size = +// sizeof(*executable) + +// entry_point_count * sizeof(executable->entry_points[0]) + +// total_entry_point_name_chars; +// IREE_RETURN_AND_END_ZONE_IF_ERROR( +// z0, iree_allocator_malloc(host_allocator, total_size, +// reinterpret_cast(&executable))); +// IREE_TRACE( +// char* string_table_buffer = +// (char*)((char*)executable + sizeof(*executable) + +// entry_point_count * sizeof(executable->entry_points[0]))); +// +// iree_hal_resource_initialize(&iree_hal_xrt_lite_native_executable_vtable, +// &executable->resource); +// +// executable->host_allocator = host_allocator; +// executable->entry_point_count = entry_point_count; +// +// for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; +// entry_ordinal++) { +// const char* entry_name = +// flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); +// uint32_t pdi_index = +// 
flatbuffers_uint32_vec_at(pdi_indices_vec, entry_ordinal); +// iree_amd_aie_hal_xrt_lite_PdiDef_table_t pdi_def = +// iree_amd_aie_hal_xrt_lite_PdiDef_vec_at(pdis_vec, pdi_index); +// flatbuffers_string_t pdi_fb = +// iree_amd_aie_hal_xrt_lite_PdiDef_pdi_get(pdi_def); +// uint32_t num_pdi_chars = flatbuffers_string_len(pdi_fb); +// uint32_t asm_instr_index = +// flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); +// iree_amd_aie_hal_xrt_lite_AsmInstDef_table_t asminst_def = +// iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_at(asm_instrs_vec, +// asm_instr_index); +// flatbuffers_uint32_vec_t asm_inst = +// iree_amd_aie_hal_xrt_lite_AsmInstDef_asm_inst_get(asminst_def); +// uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); +// +// iree_hal_xrt_lite_allocator_t* allocator = +// iree_hal_xrt_lite_allocator_cast(device_allocator); +// iree_hal_xrt_lite_kernel_info_t* params = +// &executable->entry_points[entry_ordinal]; +// params->num_instr = num_instr; +// // Load the IPU and PDI files into a global pool that doesn't support +// kernel +// // args (DEV BO). 
+// IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( +// z0, symbols, +// xrt_lite_amd_memory_pool_allocate( +// allocator->global_dev_mem_pool, num_instr * sizeof(uint32_t), 0, +// reinterpret_cast(¶ms->ipu_inst_buf)), +// "xrt_lite_amd_memory_pool_allocate"); +// std::memcpy(params->ipu_inst_buf, asm_inst, num_instr * +// sizeof(uint32_t)); IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( +// z0, symbols, +// xrt_lite_amd_get_handle_from_vaddr(params->ipu_inst_buf, +// ¶ms->ipu_inst_handle), +// "xrt_lite_amd_agent_iterate_memory_pools"); +// IREE_ASSERT(params->ipu_inst_handle); +// +// IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( +// z0, symbols, +// xrt_lite_amd_memory_pool_allocate( +// allocator->global_dev_mem_pool, num_pdi_chars, 0, +// reinterpret_cast(¶ms->pdi_buf)), +// "xrt_lite_amd_memory_pool_allocate"); +// std::memcpy(params->pdi_buf, pdi_fb, num_pdi_chars * sizeof(char)); +// IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( +// z0, symbols, +// xrt_lite_amd_get_handle_from_vaddr(params->pdi_buf, +// ¶ms->pdi_handle), +// "xrt_lite_amd_agent_iterate_memory_pools"); +// IREE_ASSERT(params->pdi_handle); +// +// (void)entry_name; +// IREE_TRACE({ +// iree_host_size_t entry_name_length = +// flatbuffers_string_len(entry_name); memcpy(string_table_buffer, +// entry_name, entry_name_length); string_table_buffer += +// entry_name_length; +// }); +// +// IREE_TRACE({ +// if +// (iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_is_present( +// executable_def)) { +// iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_t source_locs_vec = +// iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_get( +// executable_def); +// iree_amd_aie_hal_xrt_lite_FileLineLocDef_table_t source_loc = +// iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_at(source_locs_vec, +// entry_ordinal); +// flatbuffers_string_t filename = +// iree_amd_aie_hal_xrt_lite_FileLineLocDef_filename_get(source_loc); +// uint32_t line = +// iree_amd_aie_hal_xrt_lite_FileLineLocDef_line_get(source_loc); +// 
params->source_filename = +// iree_make_string_view(filename, +// flatbuffers_string_len(filename)); +// params->source_line = line; +// } +// }); +// } +// +// iree_status_t status = iree_ok_status(); +// +// if (iree_status_is_ok(status)) { +// *out_executable = reinterpret_cast(executable); +// } else { +// iree_hal_executable_destroy( +// reinterpret_cast(executable)); +// } +// +// IREE_TRACE_ZONE_END(z0); +// return status; +// } + +static void iree_hal_xrt_lite_native_executable_destroy( + iree_hal_executable_t* base_executable) { + iree_hal_xrt_lite_native_executable_t* executable = + iree_hal_xrt_lite_native_executable_cast(base_executable); + iree_allocator_t host_allocator = executable->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + for (iree_host_size_t entry_ordinal = 0; + entry_ordinal < executable->entry_point_count; entry_ordinal++) { + iree_hal_xrt_lite_kernel_info_t* params = + &executable->entry_points[entry_ordinal]; + } + + iree_allocator_free(host_allocator, executable); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_info( + iree_hal_executable_t* base_executable, int32_t entry_point, + iree_hal_xrt_lite_kernel_info_t* out_info) { + iree_hal_xrt_lite_native_executable_t* executable = + iree_hal_xrt_lite_native_executable_cast(base_executable); + if (entry_point >= executable->entry_point_count) { + return iree_make_status(IREE_STATUS_OUT_OF_RANGE, + "entry point ordinal %d out of range; executable " + "only contains %ld entry points", + entry_point, executable->entry_point_count); + } + memcpy(out_info, &executable->entry_points[entry_point], sizeof(*out_info)); + return iree_ok_status(); +} + +namespace { +const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable = + { + /*destroy=*/iree_hal_xrt_lite_native_executable_destroy, +}; +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h 
b/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h new file mode 100644 index 000000000..c2bc45b1c --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h @@ -0,0 +1,44 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ + +#include + +#include "iree/base/api.h" +#include "iree/base/tracing.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus +// Kernel data kept per executable entry point. +struct iree_hal_xrt_lite_kernel_info_t { + uint32_t ipu_inst_handle; // presumably the handle for ipu_inst_buf -- TODO confirm + uint32_t pdi_handle; // presumably the handle for pdi_buf -- TODO confirm + uint32_t* ipu_inst_buf; // instruction words, presumably num_instr of them -- TODO confirm + char* pdi_buf; // presumably the PDI bytes -- TODO confirm + uint32_t num_instr; // presumably the count of uint32_t instruction words -- TODO confirm + IREE_TRACE(iree_string_view_t function_name;) + IREE_TRACE(iree_string_view_t source_filename;) + IREE_TRACE(uint32_t source_line;) +}; +// Presumably builds an executable from |executable_params| and returns it in |out_executable| -- TODO confirm. +iree_status_t iree_hal_xrt_lite_native_executable_create( + const iree_hal_executable_params_t* executable_params, + iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator, + iree_hal_executable_t** out_executable); +// Copies the kernel info of |entry_point| into |out_info|; returns an OUT_OF_RANGE status for a bad ordinal. +iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_info( + iree_hal_executable_t* executable, int32_t entry_point, + iree_hal_xrt_lite_kernel_info_t* out_info); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt new file mode 100644 index 000000000..f387ade2a --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+# +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_cc_library( + NAME + registration + HDRS + driver_module.h + SRCS + driver_module.c + DEPS + iree::base + iree::base::core_headers + iree-amd-aie::driver::xrt-lite + iree::hal + DEFINES + "IREE_HAVE_HAL_XRT_LITE_DRIVER_MODULE=1" + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c new file mode 100644 index 000000000..f50b2ac68 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c @@ -0,0 +1,64 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" + +#include +#include + +#include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree/base/api.h" +#include "iree/base/status.h" +/* Reports the single driver this factory offers ("xrt-lite"). */ +static iree_status_t iree_hal_xrt_lite_driver_factory_enumerate( + void* self, iree_host_size_t* out_driver_info_count, + const iree_hal_driver_info_t** out_driver_infos) { + IREE_ASSERT_ARGUMENT(out_driver_info_count); + IREE_ASSERT_ARGUMENT(out_driver_infos); + IREE_TRACE_ZONE_BEGIN(z0); + /* Static storage: a pointer to this array is handed back through |out_driver_infos|. */ + static const iree_hal_driver_info_t driver_infos[1] = {{ + .driver_name = IREE_SVL("xrt-lite"), + .full_name = IREE_SVL("XRT-LITE driver (for AIE)"), + }}; + *out_driver_info_count = IREE_ARRAYSIZE(driver_infos); + *out_driver_infos = driver_infos; + + IREE_TRACE_ZONE_END(z0); + + return iree_ok_status(); +} +/* Creates the driver when |driver_name| is "xrt-lite"; any other name yields IREE_STATUS_UNAVAILABLE. */ +static iree_status_t iree_hal_xrt_lite_driver_factory_try_create( + void* self, iree_string_view_t driver_name, iree_allocator_t host_allocator, + iree_hal_driver_t** 
out_driver) { + IREE_ASSERT_ARGUMENT(out_driver); + + if (!iree_string_view_equal(driver_name, IREE_SV("xrt-lite"))) { + return iree_make_status(IREE_STATUS_UNAVAILABLE, + "no driver '%.*s' is provided by this factory", + (int)driver_name.size, driver_name.data); + } + + IREE_TRACE_ZONE_BEGIN(z0); + + iree_status_t status = + iree_hal_xrt_lite_driver_create(driver_name, host_allocator, out_driver); + + IREE_TRACE_ZONE_END(z0); + + return status; +} +/* Registers the xrt-lite driver factory with |registry|. */ +IREE_API_EXPORT iree_status_t +iree_hal_xrt_lite_driver_module_register(iree_hal_driver_registry_t* registry) { + static const iree_hal_driver_factory_t factory = { + .self = NULL, + .enumerate = iree_hal_xrt_lite_driver_factory_enumerate, + .try_create = iree_hal_xrt_lite_driver_factory_try_create, + }; + return iree_hal_driver_registry_register_factory(registry, &factory); +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h new file mode 100644 index 000000000..5b42d7ad3 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h @@ -0,0 +1,24 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +IREE_API_EXPORT iree_status_t +iree_hal_xrt_lite_driver_module_register(iree_hal_driver_registry_t* registry); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h new file mode 100644 index 000000000..70b4c88c0 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h @@ -0,0 +1,22 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H +#define IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H + +#include "iree/base/status.h" + +template +iree_status_t unimplemented(Params...) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unimplemented"); +} + +template +void unimplemented(Params...) 
{ + IREE_ASSERT(false && "unimplemented"); +} + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H diff --git a/runtime/src/iree-amd-aie/schemas/CMakeLists.txt b/runtime/src/iree-amd-aie/schemas/CMakeLists.txt index 48c2885fc..15c818aff 100644 --- a/runtime/src/iree-amd-aie/schemas/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/schemas/CMakeLists.txt @@ -12,3 +12,16 @@ flatbuffer_c_library( "--json" PUBLIC ) + +flatbuffer_c_library( + NAME + pdi_executable_def_c_fbs + SRCS + "pdi_executable_def.fbs" + FLATCC_ARGS + "--reader" + "--builder" + "--verifier" + "--json" + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs b/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs new file mode 100644 index 000000000..8d4e49c13 --- /dev/null +++ b/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs @@ -0,0 +1,57 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +namespace iree.amd.aie.hal.xrt_lite; + +file_identifier "PDIR"; +file_extension "pdir"; + +// Source code location denoted by a file name and line within that file. +table FileLineLocDef { + filename:string; + line:int32; +} + +// Assembly instructions. +table AsmInstDef { + asm_inst:[uint32]; +} + +// PDIs. +table PdiDef { + pdi:string; +} + +table ExecutableDef { + // A map of entry point ordinals to string names as used in PDI(s) + entry_points:[string]; + + // A map of entry point ordinals to the indices of the containing XCLBINs (the following field). + // This list has the same size as the entry_points list. + // This list currently is just a range (0, number of entry points] but will change when we start doing + // kernel merging in the backend. + pdi_indices:[uint32]; + + + // PDI strings of the entry points. 
+ pdis: [PdiDef]; + + // A map of entry point ordinals to the indices of the containing asm_instrs (the following field). + // This list has the same size as the entry_points list. + // This list currently is just a range [0, number of entry points) but can change if kernels decide to + // share the instruction streams. + asm_instr_indices:[uint32]; + + // Assembly instructions stream for LX6 processor to run for each kernel + // The number of kernels and by extension the number of asm instruction streams + // are equal to the number of entry points. We access each kernel + // by giving the entry point name to the pdi and getting a kernel object from it. + asm_instrs:[AsmInstDef]; + + source_locations:[FileLineLocDef]; +} + +root_type ExecutableDef; From 5a17ba00f4d6fef95395047701c0300184a8928c Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 2 Oct 2024 19:00:22 -0400 Subject: [PATCH 02/35] add linux kmq shim (doesn't build) --- iree_compiler_plugin.cmake | 3 - .../driver/xrt-lite/CMakeLists.txt | 1 + .../driver/xrt-lite/cts/CMakeLists.txt | 10 +- .../driver/xrt-lite/shim/CMakeLists.txt | 10 + .../driver/xrt-lite/shim/linux/CMakeLists.txt | 8 + .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 31 + .../xrt-lite/shim/linux/kmq/amdxdna_accel.h | 591 +++++++++ .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 443 +++++++ .../driver/xrt-lite/shim/linux/kmq/bo.h | 191 +++ .../driver/xrt-lite/shim/linux/kmq/device.cpp | 119 ++ .../driver/xrt-lite/shim/linux/kmq/device.h | 86 ++ .../driver/xrt-lite/shim/linux/kmq/ert.h | 1176 +++++++++++++++++ .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 231 ++++ .../driver/xrt-lite/shim/linux/kmq/fence.h | 62 + .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 151 +++ .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 88 ++ .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 88 ++ .../driver/xrt-lite/shim/linux/kmq/hwq.h | 43 + .../driver/xrt-lite/shim/linux/kmq/pcidev.cpp | 444 +++++++ .../driver/xrt-lite/shim/linux/kmq/pcidev.h | 86 ++
.../driver/xrt-lite/shim/linux/kmq/pcidrv.cpp | 49 + .../driver/xrt-lite/shim/linux/kmq/pcidrv.h | 24 + .../driver/xrt-lite/shim/linux/kmq/shared.h | 24 + .../xrt-lite/shim/linux/kmq/shim_debug.cpp | 35 + .../xrt-lite/shim/linux/kmq/shim_debug.h | 57 + 25 files changed, 4043 insertions(+), 8 deletions(-) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h create 
mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h diff --git a/iree_compiler_plugin.cmake b/iree_compiler_plugin.cmake index 874e6df00..958d6de46 100644 --- a/iree_compiler_plugin.cmake +++ b/iree_compiler_plugin.cmake @@ -22,8 +22,5 @@ if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) endif() include(iree_aie_bootgen) -set(IREE_TARGET_BACKEND_XRT_LITE ON CACHE BOOL "") -set(IREE_TARGET_BACKEND_XRT ON CACHE BOOL "") - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/compiler/plugins/target/AMD-AIE target/AMD-AIE) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/compiler/plugins/preprocessing/XDNA-OPLIB preprocessing/XDNA-OPLIB) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt index 56f3e5b68..248412689 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -30,5 +30,6 @@ iree_cc_library( iree::base::core_headers iree::base::internal::flatcc::parsing iree-amd-aie::schemas::pdi_executable_def_c_fbs + iree-amd-aie::driver::xrt-lite::shim::linux::kmq::kmq PUBLIC ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt index 28467cb90..e210f00a1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -24,7 +24,7 @@ iree_bytecode_module( FLAGS --compile-mode=hal-executable --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} - --iree-hal-target-backends=xrt-lite + --iree-hal-target-backends=amd-aie --iree-amdaie-lower-to-aie-pipeline=air --iree-amdaie-target-device=${TARGET_DEVICE} --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} @@ -63,15 +63,15 @@ iree_hal_cts_test_suite( DRIVER_REGISTRATION_FN "iree_hal_xrt_lite_driver_module_register" 
COMPILER_TARGET_BACKEND - "xrt-lite" + "amd-aie" EXECUTABLE_FORMAT "\"amdaie-pdi-fb\"" DEPS iree-amd-aie::driver::xrt-lite::registration INCLUDED_TESTS - "allocator" - "buffer_mapping" - "command_buffer" +# "allocator" +# "buffer_mapping" +# "command_buffer" "driver" ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt new file mode 100644 index 000000000..ac1522216 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if(UNIX) + add_subdirectory(linux) +endif() \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt new file mode 100644 index 000000000..c4e1e5604 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_add_all_subdirs() diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt new file mode 100644 index 000000000..4cccb9548 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_cc_library( + NAME + kmq + SRCS + amdxdna_accel.h + bo.cpp + bo.h + device.cpp + device.h + fence.cpp + fence.h + hwctx.cpp + hwctx.h + hwq.cpp + hwq.h + pcidev.cpp + pcidev.h + pcidrv.cpp + pcidrv.h + shared.h + shim_debug.cpp + shim_debug.h + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h new file mode 100644 index 000000000..e7f52afc3 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h @@ -0,0 +1,591 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef AMDXDNA_ACCEL_H_ +#define AMDXDNA_ACCEL_H_ + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#define AMDXDNA_DRIVER_MAJOR 1 +#define AMDXDNA_DRIVER_MINOR 0 + +#define AMDXDNA_INVALID_ADDR (~0UL) +#define AMDXDNA_INVALID_CTX_HANDLE 0 +#define AMDXDNA_INVALID_BO_HANDLE 0 +#define AMDXDNA_INVALID_FENCE_HANDLE 0 + +/* + * The interface can grow/extend over time. + * On each struct amdxdna_drm_*, to support potential extension, we defined it + * like this. + * + * Example code: + * + * struct amdxdna_drm_example_data { + * .ext = (uintptr_t)&example_data_ext; + * ... + * }; + * + * We don't have extension now. The extension struct will define in the future. 
+ */ + +enum amdxdna_drm_ioctl_id { + DRM_AMDXDNA_CREATE_HWCTX, + DRM_AMDXDNA_DESTROY_HWCTX, + DRM_AMDXDNA_CONFIG_HWCTX, + DRM_AMDXDNA_CREATE_BO, + DRM_AMDXDNA_GET_BO_INFO, + DRM_AMDXDNA_SYNC_BO, + DRM_AMDXDNA_EXEC_CMD, + DRM_AMDXDNA_WAIT_CMD, + DRM_AMDXDNA_GET_INFO, + DRM_AMDXDNA_SET_STATE, + DRM_AMDXDNA_NUM_IOCTLS +}; + +enum amdxdna_device_type { + AMDXDNA_DEV_TYPE_UNKNOWN = -1, + AMDXDNA_DEV_TYPE_KMQ, + AMDXDNA_DEV_TYPE_UMQ, +}; + +/** + * struct qos_info - QoS information for driver. + * @gops: Giga operations per second. + * @fps: Frames per second. + * @dma_bandwidth: DMA bandwidtha. + * @latency: Frame response latency. + * @frame_exec_time: Frame execution time. + * @priority: Request priority. + * + * User program can provide QoS hints to driver. + */ +struct amdxdna_qos_info { + __u32 gops; + __u32 fps; + __u32 dma_bandwidth; + __u32 latency; + __u32 frame_exec_time; + __u32 priority; +}; + +/** + * struct amdxdna_drm_create_hwctx - Create hardware context. + * @ext: MBZ. + * @ext_flags: MBZ. + * @qos_p: Address of QoS info. + * @umq_bo: BO handle for user mode queue(UMQ). + * @log_buf_bo: BO handle for log buffer. + * @max_opc: Maximum operations per cycle. + * @num_tiles: Number of AIE tiles. + * @mem_size: Size of AIE tile memory. + * @umq_doorbell: Returned offset of doorbell associated with UMQ. + * @handle: Returned hardware context handle. + */ +struct amdxdna_drm_create_hwctx { + __u64 ext; + __u64 ext_flags; + __u64 qos_p; + __u32 umq_bo; + __u32 log_buf_bo; + __u32 max_opc; + __u32 num_tiles; + __u32 mem_size; + __u32 umq_doorbell; + __u32 handle; +}; + +/** + * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. + * @handle: Hardware context handle. + * @pad: MBZ. 
+ */ +struct amdxdna_drm_destroy_hwctx { + __u32 handle; + __u32 pad; +}; + +/** + * struct amdxdna_cu_config - configuration for one CU + * @cu_bo: CU configuration buffer bo handle + * @cu_func: Functional of a CU + * @pad: MBZ + */ +struct amdxdna_cu_config { + __u32 cu_bo; + __u8 cu_func; + __u8 pad[3]; +}; + +/** + * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware + * context + * @num_cus: Number of CUs to configure + * @pad: MBZ + * @cu_configs: Array of CU configurations of struct amdxdna_cu_config + */ +struct amdxdna_hwctx_param_config_cu { + __u16 num_cus; + __u16 pad[3]; + struct amdxdna_cu_config cu_configs[] __counted_by(num_cus); +}; + +enum amdxdna_drm_config_hwctx_param { + DRM_AMDXDNA_HWCTX_CONFIG_CU, + DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + DRM_AMDXDNA_HWCTX_CONFIG_NUM +}; + +/** + * struct amdxdna_drm_config_hwctx - Configure hardware context. + * @handle: hardware context handle. + * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the + * structure passed in via param_val. + * @param_val: A structure specified by the param_type struct member. + * @param_val_size: Size of the parameter buffer pointed to by the param_val. + * If param_val is not a pointer, driver can ignore this. + * + * Note: if the param_val is a pointer pointing to a buffer, the maximum size + * of the buffer is 4KiB(PAGE_SIZE). 
+ */ +struct amdxdna_drm_config_hwctx { + __u32 handle; + __u32 param_type; + __u64 param_val; + __u32 param_val_size; + __u32 pad; +}; + +/* + * AMDXDNA_BO_SHMEM: DRM GEM SHMEM bo + * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory + * AMDXDNA_BO_DEV_BO: Allocated from BO_DEV_HEAP + * AMDXDNA_BO_CMD: User and driver accessible bo + * AMDXDNA_BO_DMA: DRM GEM DMA bo + */ +enum amdxdna_bo_type { + AMDXDNA_BO_INVALID = 0, + AMDXDNA_BO_SHMEM, + AMDXDNA_BO_DEV_HEAP, + AMDXDNA_BO_DEV, + AMDXDNA_BO_CMD, + AMDXDNA_BO_DMA, +}; + +/** + * struct amdxdna_drm_create_bo - Create a buffer object. + * @flags: Buffer flags. MBZ. + * @type: Buffer type. + * @vaddr: User VA of buffer if applied. MBZ. + * @size: Size in bytes. + * @handle: Returned DRM buffer object handle. + */ +struct amdxdna_drm_create_bo { + __u64 flags; + __u32 type; + __u32 _pad; + __u64 vaddr; + __u64 size; + __u32 handle; +}; + +/** + * struct amdxdna_drm_get_bo_info - Get buffer object information. + * @ext: MBZ. + * @ext_flags: MBZ. + * @handle: DRM buffer object handle. + * @map_offset: Returned DRM fake offset for mmap(). + * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). + * @xdna_addr: Returned XDNA device virtual address. + */ +struct amdxdna_drm_get_bo_info { + __u64 ext; + __u64 ext_flags; + __u32 handle; + __u32 _pad; + __u64 map_offset; + __u64 vaddr; + __u64 xdna_addr; +}; + +/** + * struct amdxdna_drm_sync_bo - Sync buffer object. + * @handle: Buffer object handle. + * @direction: Direction of sync, can be from device or to device. + * @offset: Offset in the buffer to sync. + * @size: Size in bytes. + */ +struct amdxdna_drm_sync_bo { + __u32 handle; +#define SYNC_DIRECT_TO_DEVICE 0U +#define SYNC_DIRECT_FROM_DEVICE 1U + __u32 direction; + __u64 offset; + __u64 size; +}; + +enum amdxdna_cmd_type { + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0, + AMDXDNA_CMD_SUBMIT_DEPENDENCY, + AMDXDNA_CMD_SUBMIT_SIGNAL, +}; + +/** + * struct amdxdna_drm_exec_cmd - Execute command. 
+ * @ext: MBZ. + * @ext_flags: MBZ. + * @hwctx: Hardware context handle. + * @type: One of command type in enum amdxdna_cmd_type. + * @cmd_handles: Array of command handles or the command handle itself in case + * of just one. + * @args: Array of arguments for all command handles. + * @cmd_count: Number of command handles in the cmd_handles array. + * @arg_count: Number of arguments in the args array. + * @seq: Returned sequence number for this command. + */ +struct amdxdna_drm_exec_cmd { + __u64 ext; + __u64 ext_flags; + __u32 hwctx; + __u32 type; + __u64 cmd_handles; + __u64 args; + __u32 cmd_count; + __u32 arg_count; + __u64 seq; +}; + +/** + * struct amdxdna_drm_wait_cmd - Wait exectuion command. + * + * @hwctx: hardware context handle. + * @timeout: timeout in ms, 0 implies infinite wait. + * @seq: sequence number of the command returned by execute command. + * + * Wait a command specified by seq to be completed. + */ +struct amdxdna_drm_wait_cmd { + __u32 hwctx; + __u32 timeout; + __u64 seq; +}; + +/** + * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware + * @buffer: The user space buffer that will return the AIE status + * @buffer_size: The size of the user space buffer + * @cols_filled: A bitmap of AIE columns whose data has been returned in the + * buffer. + */ +struct amdxdna_drm_query_aie_status { + __u64 buffer; /* out */ + __u32 buffer_size; /* in */ + __u32 cols_filled; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware + * @major: The major version number + * @minor: The minor version number + */ +struct amdxdna_drm_query_aie_version { + __u32 major; /* out */ + __u32 minor; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile + * (core, mem, shim) + * @row_count: The number of rows. + * @row_start: The starting row number. + * @dma_channel_count: The number of dma channels. + * @lock_count: The number of locks. 
+ * @event_reg_count: The number of events. + * @pad: MBZ. + */ +struct amdxdna_drm_query_aie_tile_metadata { + __u16 row_count; + __u16 row_start; + __u16 dma_channel_count; + __u16 lock_count; + __u16 event_reg_count; + __u16 pad[3]; +}; + +/** + * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE + * hardware + * @col_size: The size of a column in bytes. + * @cols: The total number of columns. + * @rows: The total number of rows. + * @version: The version of the AIE hardware. + * @core: The metadata for all core tiles. + * @mem: The metadata for all mem tiles. + * @shim: The metadata for all shim tiles. + */ +struct amdxdna_drm_query_aie_metadata { + __u32 col_size; + __u16 cols; + __u16 rows; + struct amdxdna_drm_query_aie_version version; + struct amdxdna_drm_query_aie_tile_metadata core; + struct amdxdna_drm_query_aie_tile_metadata mem; + struct amdxdna_drm_query_aie_tile_metadata shim; +}; + +/** + * struct amdxdna_drm_query_clock - Metadata for a clock + * @name: The clock name. + * @freq_mhz: The clock frequency. + * @pad: MBZ. + */ +struct amdxdna_drm_query_clock { + __u8 name[16]; + __u32 freq_mhz; + __u32 pad; +}; + +/** + * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks + * @mp_npu_clock: The metadata for MP-NPU clock. + * @h_clock: The metadata for H clock. + */ +struct amdxdna_drm_query_clock_metadata { + struct amdxdna_drm_query_clock mp_npu_clock; + struct amdxdna_drm_query_clock h_clock; +}; + +enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; + +/** + * struct amdxdna_drm_query_sensor - The data for single sensor. + * @label: The name for a sensor. + * @input: The current value of the sensor. + * @max: The maximum value possible for the sensor. + * @average: The average value of the sensor. + * @highest: The highest recorded sensor value for this driver load for the + * sensor. + * @status: The sensor status. + * @units: The sensor units. 
+ * @unitm: Translates value member variables into the correct unit via (pow(10, + * unitm) * value) + * @type: The sensor type from enum amdxdna_sensor_type + * @pad: MBZ. + */ +struct amdxdna_drm_query_sensor { + __u8 label[64]; + __u32 input; + __u32 max; + __u32 average; + __u32 highest; + __u8 status[64]; + __u8 units[16]; + __s8 unitm; + __u8 type; + __u8 pad[6]; +}; + +/** + * struct amdxdna_drm_query_hwctx - The data for single context. + * @context_id: The ID for this context. + * @start_col: The starting column for the partition assigned to this context. + * @num_col: The number of columns in the partition assigned to this context. + * @pid: The Process ID of the process that created this context. + * @command_submissions: The number of commands submitted to this context. + * @command_completions: The number of commands completed by this context. + * @migrations: The number of times this context has been moved to a different + * partition. + * @preemptions: The number of times this context has been preempted by another + * context in the same partition. + * @pad: MBZ. + */ +struct amdxdna_drm_query_hwctx { + __u32 context_id; + __u32 start_col; + __u32 num_col; + __u32 pad; + __s64 pid; + __u64 command_submissions; + __u64 command_completions; + __u64 migrations; + __u64 preemptions; + __u64 errors; +}; + +/** + * struct amdxdna_drm_aie_mem - The data for AIE memory read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE memory address to read/write + * @size: The size of bytes to read/write + * @buf_p: The buffer to store read/write data + * + * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM + * parameters. 
+ */ +struct amdxdna_drm_aie_mem { + __u32 col; + __u32 row; + __u32 addr; + __u32 size; + __u64 buf_p; +}; + +/** + * struct amdxdna_drm_aie_reg - The data for AIE register read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE register address to read/write + * @val: The value to write or returned value from AIE + * + * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG + * parameters. + */ +struct amdxdna_drm_aie_reg { + __u32 col; + __u32 row; + __u32 addr; + __u32 val; +}; + +enum amdxdna_power_mode_type { + POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */ + POWER_MODE_LOW, /**< Set frequency to lowest DPM */ + POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ + POWER_MODE_HIGH, /**< Set frequency to highest DPM */ + POWER_MODE_TURBO, /**< More power, more performance */ +}; + +/** + * struct amdxdna_drm_get_power_mode - Get the power mode of the AIE hardware + * @power_mode: The sensor type from enum amdxdna_power_mode_type + * @pad: MBZ. + */ +struct amdxdna_drm_get_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +/** + * struct amdxdna_drm_query_firmware_version - Query the version of the firmware + * @major: The major version number + * @minor: The minor version number + * @patch: The patch level version number + * @build: The build ID + */ +struct amdxdna_drm_query_firmware_version { + __u32 major; /* out */ + __u32 minor; /* out */ + __u32 patch; /* out */ + __u32 build; /* out */ +}; + +enum amdxdna_drm_get_param { + DRM_AMDXDNA_QUERY_AIE_STATUS, + DRM_AMDXDNA_QUERY_AIE_METADATA, + DRM_AMDXDNA_QUERY_AIE_VERSION, + DRM_AMDXDNA_QUERY_CLOCK_METADATA, + DRM_AMDXDNA_QUERY_SENSORS, + DRM_AMDXDNA_QUERY_HW_CONTEXTS, + DRM_AMDXDNA_READ_AIE_MEM, + DRM_AMDXDNA_READ_AIE_REG, + DRM_AMDXDNA_QUERY_FIRMWARE_VERSION, + DRM_AMDXDNA_GET_POWER_MODE, + DRM_AMDXDNA_QUERY_TELEMETRY, + DRM_AMDXDNA_NUM_GET_PARAM, +}; + +/** + * struct amdxdna_drm_get_info - Get some information from the AIE hardware. 
+ * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. Size needed/written by the kernel. + * @buffer: A structure specified by the param struct member. + */ +struct amdxdna_drm_get_info { + __u32 param; /* in */ + __u32 buffer_size; /* in/out */ + __u64 buffer; /* in/out */ +}; + +/** + * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware + * @power_mode: The sensor type from enum amdxdna_power_mode_type + * @pad: MBZ. + */ +struct amdxdna_drm_set_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +enum amdxdna_drm_set_param { + DRM_AMDXDNA_SET_POWER_MODE, + DRM_AMDXDNA_WRITE_AIE_MEM, + DRM_AMDXDNA_WRITE_AIE_REG, + DRM_AMDXDNA_NUM_SET_PARAM, +}; + +/** + * struct amdxdna_drm_set_state - Set the state of some component within the AIE + * hardware. + * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. + * @buffer: A structure specified by the param struct member. 
+ */ +struct amdxdna_drm_set_state { + __u32 param; /* in */ + __u32 buffer_size; /* in */ + __u64 buffer; /* in */ +}; + +#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ + struct amdxdna_drm_create_hwctx) + +#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ + struct amdxdna_drm_destroy_hwctx) + +#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ + struct amdxdna_drm_config_hwctx) + +#define DRM_IOCTL_AMDXDNA_CREATE_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ + struct amdxdna_drm_create_bo) + +#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ + struct amdxdna_drm_get_bo_info) + +#define DRM_IOCTL_AMDXDNA_SYNC_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) + +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) + +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) + +#define DRM_IOCTL_AMDXDNA_GET_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) + +#define DRM_IOCTL_AMDXDNA_SET_STATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ + struct amdxdna_drm_set_state) + +#if defined(__cplusplus) +} /* extern c end */ +#endif + +#endif /* AMDXDNA_ACCEL_H_ */ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp new file mode 100644 index 000000000..3076a386a --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "bo.h" + +#include +#include +#include +#include +#include + +#include "shim_debug.h" + +namespace { + +uint32_t alloc_drm_bo(const shim_xdna::pdev& dev, amdxdna_bo_type type, + void* buf, size_t size) { + amdxdna_drm_create_bo cbo = { + .type = type, + .vaddr = reinterpret_cast(buf), + .size = size, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CREATE_BO, &cbo); + return cbo.handle; +} + +void free_drm_bo(const shim_xdna::pdev& dev, uint32_t boh) { + drm_gem_close close_bo = {boh, 0}; + dev.ioctl(DRM_IOCTL_GEM_CLOSE, &close_bo); +} + +void get_drm_bo_info(const shim_xdna::pdev& dev, uint32_t boh, + amdxdna_drm_get_bo_info* bo_info) { + bo_info->handle = boh; + dev.ioctl(DRM_IOCTL_AMDXDNA_GET_BO_INFO, bo_info); +} + +void* map_parent_range(size_t size) { + auto p = ::mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (p == MAP_FAILED) shim_err(errno, "mmap(len=%ld) failed", size); // mmap() reports failure as MAP_FAILED, never as NULL + + return p; +} + +void* map_drm_bo(const shim_xdna::pdev& dev, size_t size, int prot, + uint64_t offset) { + return dev.mmap(0, size, prot, MAP_SHARED | MAP_LOCKED, offset); +} + +void* map_drm_bo(const shim_xdna::pdev& dev, void* addr, size_t size, int prot, + int flags, uint64_t offset) { + return dev.mmap(addr, size, prot, flags, offset); +} + +void unmap_drm_bo(const shim_xdna::pdev& dev, void* addr, size_t size) { + dev.munmap(addr, size); +} + +void attach_dbg_drm_bo(const shim_xdna::pdev& dev, uint32_t boh, + uint32_t ctx_id) { + amdxdna_drm_config_hwctx adbo = { + .handle = ctx_id, + .param_type = DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + .param_val = boh, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); +} + +void detach_dbg_drm_bo(const shim_xdna::pdev& dev, uint32_t boh, + uint32_t ctx_id) { + amdxdna_drm_config_hwctx adbo = { + .handle = ctx_id, + .param_type = DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + .param_val = boh, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); +} + +int export_drm_bo(const shim_xdna::pdev& dev, uint32_t boh) { + 
drm_prime_handle exp_bo = {boh, DRM_RDWR | DRM_CLOEXEC, -1}; + dev.ioctl(DRM_IOCTL_PRIME_HANDLE_TO_FD, &exp_bo); + return exp_bo.fd; +} + +uint32_t import_drm_bo(const shim_xdna::pdev& dev, + const shim_xdna::shared_handle& share, + amdxdna_bo_type* type, size_t* size) { + shim_xdna::shared_handle::export_handle fd = share.get_export_handle(); + drm_prime_handle imp_bo = {AMDXDNA_INVALID_BO_HANDLE, 0, fd}; + dev.ioctl(DRM_IOCTL_PRIME_FD_TO_HANDLE, &imp_bo); + + *type = AMDXDNA_BO_SHMEM; + *size = lseek(fd, 0, SEEK_END); + lseek(fd, 0, SEEK_SET); + + return imp_bo.handle; +} + +bool is_power_of_two(size_t x) { return (x > 0) && ((x & (x - 1)) == 0); } + +void* addr_align(void* p, size_t align) { + if (!is_power_of_two(align)) + shim_err(EINVAL, "Alignment 0x%lx is not power of two", align); + + return (void*)(((uintptr_t)p + align) & ~(align - 1)); +} + +amdxdna_bo_type flag_to_type(uint64_t bo_flags) { + auto flags = shim_xdna::xcl_bo_flags{bo_flags}; + auto boflags = (static_cast(flags.boflags) << 24); + switch (boflags) { + case XCL_BO_FLAGS_NONE: + case XCL_BO_FLAGS_HOST_ONLY: + return AMDXDNA_BO_SHMEM; + case XCL_BO_FLAGS_CACHEABLE: + return AMDXDNA_BO_DEV; + case XCL_BO_FLAGS_EXECBUF: + return AMDXDNA_BO_CMD; + default: + break; + } + return AMDXDNA_BO_INVALID; +} + +// flush cache lines for non-coherent memory +inline void clflush_data(const void* base, size_t offset, size_t len) { + static long cacheline_size = 0; + + if (!cacheline_size) { + long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sz <= 0) shim_err(EINVAL, "Invalid cache line size: %ld", sz); + cacheline_size = sz; + } + + const char* cur = (const char*)base; + cur += offset; + uintptr_t lastline = (uintptr_t)(cur + len - 1) | (cacheline_size - 1); + do { + _mm_clflush(cur); + cur += cacheline_size; + } while (cur <= (const char*)lastline); +} + +void sync_drm_bo(const shim_xdna::pdev& dev, uint32_t boh, + shim_xdna::bo::direction dir, size_t offset, size_t len) { + amdxdna_drm_sync_bo sbo = { 
+ .handle = boh, + .direction = (dir == shim_xdna::bo::direction::host2device + ? SYNC_DIRECT_TO_DEVICE + : SYNC_DIRECT_FROM_DEVICE), + .offset = offset, + .size = len, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_SYNC_BO, &sbo); +} + +bool is_driver_sync() { + static int drv_sync = -1; + + if (drv_sync == -1) { + bool ds = std::getenv("Debug.force_driver_sync"); + drv_sync = ds ? 1 : 0; + } + return drv_sync == 1; +} + +} // namespace + +namespace shim_xdna { + +bo::drm_bo::drm_bo(bo& parent, const amdxdna_drm_get_bo_info& bo_info) + : m_parent(parent), + m_handle(bo_info.handle), + m_map_offset(bo_info.map_offset), + m_vaddr(bo_info.vaddr), + m_xdna_addr(bo_info.xdna_addr) {} + +bo::drm_bo::~drm_bo() { + if (m_handle == AMDXDNA_INVALID_BO_HANDLE) return; + free_drm_bo(m_parent.m_pdev, m_handle); +} + +std::string bo::type_to_name() const { + switch (m_type) { + case AMDXDNA_BO_SHMEM: + return std::string("AMDXDNA_BO_SHMEM"); + case AMDXDNA_BO_DEV_HEAP: + return std::string("AMDXDNA_BO_DEV_HEAP"); + case AMDXDNA_BO_DEV: + if (xcl_bo_flags{m_flags}.use == XRT_BO_USE_DEBUG) + return std::string("AMDXDNA_BO_DEV_DEBUG"); + return std::string("AMDXDNA_BO_DEV"); + case AMDXDNA_BO_CMD: + return std::string("AMDXDNA_BO_CMD"); + } + return std::string("BO_UNKNOWN"); +} + +std::string bo::describe() const { + std::string desc = "type="; + desc += type_to_name(); + desc += ", "; + desc += "drm_bo="; + desc += std::to_string(m_bo->m_handle); + desc += ", "; + desc += "size="; + desc += std::to_string(m_aligned_size); + return desc; +} + +void bo::mmap_bo(size_t align) { + size_t a = align; + + if (m_bo->m_map_offset == AMDXDNA_INVALID_ADDR) { + m_aligned = reinterpret_cast(m_bo->m_vaddr); + return; + } + + if (a == 0) { + m_aligned = map_drm_bo(m_pdev, m_aligned_size, PROT_READ | PROT_WRITE, + m_bo->m_map_offset); + return; + } + + /* + * Handle special alignment + * The first mmap() only reserves a range in the user virtual address + * space. 
The second mmap() uses an aligned addr as the first argument in mmap + * syscall. + */ + m_parent_size = align * 2 - 1; + m_parent = map_parent_range(m_parent_size); + auto aligned = addr_align(m_parent, align); + m_aligned = + map_drm_bo(m_pdev, aligned, m_aligned_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, m_bo->m_map_offset); +} + +void bo::munmap_bo() { + shim_debug("Unmap BO, aligned %p parent %p", m_aligned, m_parent); + if (m_bo->m_map_offset == AMDXDNA_INVALID_ADDR) return; + + unmap_drm_bo(m_pdev, m_aligned, m_aligned_size); + if (m_parent) unmap_drm_bo(m_pdev, m_parent, m_parent_size); +} + +void bo::alloc_bo() { + uint32_t boh = alloc_drm_bo(m_pdev, m_type, NULL, m_aligned_size); + + amdxdna_drm_get_bo_info bo_info = {}; + get_drm_bo_info(m_pdev, boh, &bo_info); + m_bo = std::make_unique(*this, bo_info); +} + +void bo::import_bo() { + uint32_t boh = import_drm_bo(m_pdev, m_import, &m_type, &m_aligned_size); + + amdxdna_drm_get_bo_info bo_info = {}; + get_drm_bo_info(m_pdev, boh, &bo_info); + m_bo = std::make_unique(*this, bo_info); +} + +void bo::free_bo() { m_bo.reset(); } + +bo::properties bo::get_properties() const { + return {m_flags, m_aligned_size, get_paddr(), get_drm_bo_handle()}; +} + +void* bo::map(bo::map_type type) { + if (type != bo::map_type::write) + shim_err( + EINVAL, + "Not support map BO as readonly. 
Type must be bo::map_type::write"); + return m_aligned; +} + +void bo::unmap(void* addr) {} + +uint64_t bo::get_paddr() const { + if (m_bo->m_xdna_addr != AMDXDNA_INVALID_ADDR) return m_bo->m_xdna_addr; + return reinterpret_cast(m_aligned); +} + +void bo::set_cmd_id(uint64_t id) { m_cmd_id = id; } + +uint64_t bo::get_cmd_id() const { return m_cmd_id; } + +uint32_t bo::get_drm_bo_handle() const { return m_bo->m_handle; } + +void bo::attach_to_ctx() { + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; + + auto boh = get_drm_bo_handle(); + shim_debug("Attaching drm_bo %d to ctx: %d", boh, m_owner_ctx_id); + attach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); +} + +void bo::detach_from_ctx() { + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; + + auto boh = get_drm_bo_handle(); + shim_debug("Detaching drm_bo %d from ctx: %d", boh, m_owner_ctx_id); + detach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); +} + +std::unique_ptr bo::share() const { + auto boh = get_drm_bo_handle(); + auto fd = export_drm_bo(m_pdev, boh); + shim_debug("Exported bo %d to fd %d", boh, fd); + return std::make_unique(fd); +} + +amdxdna_bo_type bo::get_type() const { return m_type; } + +bo::bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, + uint64_t flags) + : bo(device, ctx_id, size, flags, flag_to_type(flags)) { + if (m_type == AMDXDNA_BO_INVALID) + shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); +} + +bo::bo(const device& device, size_t size, amdxdna_bo_type type) + : bo(device, AMDXDNA_INVALID_CTX_HANDLE, size, 0, type) {} + +bo::bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, + uint64_t flags, amdxdna_bo_type type) + : m_pdev(device.get_pdev()), + m_aligned_size(size), + m_flags(flags), + m_type(type), + m_import(-1), + m_owner_ctx_id(ctx_id) { + size_t align = 0; + + if (m_type == AMDXDNA_BO_DEV_HEAP) + align = 64 * 1024 * 1024; // Device mem heap must align at 64MB boundary. 
+ + alloc_bo(); + mmap_bo(align); + + // Newly allocated buffer may contain dirty pages. If used as output buffer, + // the data in cacheline will be flushed onto memory and pollute the output + // from device. We perform a cache flush right after the BO is allocated to + // avoid this issue. + if (m_type == AMDXDNA_BO_SHMEM) sync(direction::host2device, size, 0); + + attach_to_ctx(); + + shim_debug( + "Allocated KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, type=%d, " + "drm_bo=%d)", + m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); +} + +bo::bo(const device& device, shared_handle::export_handle ehdl) + : m_pdev(device.get_pdev()), m_import(ehdl) { + import_bo(); + mmap_bo(); + shim_debug( + "Imported KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, type=%d, " + "drm_bo=%d)", + m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); +} + +bo::~bo() { + shim_debug("Freeing KMQ BO, %s", describe().c_str()); + + munmap_bo(); + detach_from_ctx(); + // If BO is in use, we should block and wait in driver + free_bo(); +} + +void bo::sync(direction dir, size_t size, size_t offset) { + if (is_driver_sync()) { + sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); + return; + } + + if (offset + size > m_aligned_size) + shim_err(EINVAL, "Invalid BO offset and size for sync'ing: %ld, %ld", + offset, size); + + switch (m_type) { + case AMDXDNA_BO_SHMEM: + case AMDXDNA_BO_CMD: + clflush_data(m_aligned, offset, size); + break; + case AMDXDNA_BO_DEV: + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) + clflush_data(m_aligned, offset, size); + else + sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); + break; + default: + shim_err(ENOTSUP, "Can't sync bo type %d", m_type); + } +} + +void bo::bind_at(size_t pos, const bo* bh, size_t offset, size_t size) { + auto boh = reinterpret_cast(bh); + std::lock_guard lg(m_args_map_lock); + + if (m_type != AMDXDNA_BO_CMD) + shim_err(EINVAL, "Can't call bind_at() on non-cmd BO"); + + if (!pos) 
m_args_map.clear(); + + if (boh->get_type() != AMDXDNA_BO_CMD) { + auto h = boh->get_drm_bo_handle(); + m_args_map[pos] = h; + shim_debug("Added arg BO %d to cmd BO %d", h, get_drm_bo_handle()); + } else { + const size_t max_args_order = 6; + const size_t max_args = 1 << max_args_order; + size_t key = pos << max_args_order; + uint32_t hs[max_args]; + auto arg_cnt = boh->get_arg_bo_handles(hs, max_args); + std::string bohs; + for (int i = 0; i < arg_cnt; i++) { + m_args_map[key + i] = hs[i]; + bohs += std::to_string(hs[i]) + " "; + } + shim_debug("Added arg BO %s to cmd BO %d", bohs.c_str(), + get_drm_bo_handle()); + } +} + +uint32_t bo::get_arg_bo_handles(uint32_t* handles, size_t num) const { + std::lock_guard lg(m_args_map_lock); + + auto sz = m_args_map.size(); + if (sz > num) + shim_err(E2BIG, "There are %ld BO args, provided buffer can hold only %ld", + sz, num); + + for (auto m : m_args_map) *(handles++) = m.second; + + return sz; +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h new file mode 100644 index 000000000..2f513ae8d --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef _BO_XDNA_H_ +#define _BO_XDNA_H_ + +#include +#include + +#include "amdxdna_accel.h" +#include "device.h" +#include "hwctx.h" +#include "pcidev.h" +#include "shared.h" +#include "shim_debug.h" + +namespace shim_xdna { + +#define XRT_BO_USE_NORMAL 0 +#define XRT_BO_USE_DEBUG 1 + +/** + * XCL BO Flags bits layout + * + * bits 0 ~ 15: DDR BANK index + * bits 24 ~ 31: BO flags + */ +#define XRT_BO_FLAGS_MEMIDX_MASK (0xFFFFFFUL) +#define XCL_BO_FLAGS_NONE (0) +#define XCL_BO_FLAGS_CACHEABLE (1U << 24) +#define XCL_BO_FLAGS_KERNBUF (1U << 25) +#define XCL_BO_FLAGS_SGL (1U << 26) +#define XCL_BO_FLAGS_SVM (1U << 27) +#define XCL_BO_FLAGS_DEV_ONLY (1U << 28) +#define XCL_BO_FLAGS_HOST_ONLY (1U << 29) +#define XCL_BO_FLAGS_P2P (1U << 30) +#define XCL_BO_FLAGS_EXECBUF (1U << 31) + +/** + * Encoding of flags passed to xcl buffer allocation APIs + */ +struct xcl_bo_flags { + union { + uint64_t all; // [63-0] + + struct { + uint32_t flags; // [31-0] + uint32_t extension; // [63-32] + }; + + struct { + uint16_t bank; // [15-0] + uint8_t slot; // [23-16] + uint8_t boflags; // [31-24] + + // extension + uint32_t access : 2; // [33-32] + uint32_t dir : 2; // [35-34] + uint32_t use : 1; // [36] + uint32_t unused : 27; // [63-37] + }; + }; +}; + +struct bo { + // map_type - determines how a buffer is mapped + enum class map_type { read, write }; + + enum xclBOSyncDirection { + XCL_BO_SYNC_BO_TO_DEVICE = 0, + XCL_BO_SYNC_BO_FROM_DEVICE, + XCL_BO_SYNC_BO_GMIO_TO_AIE, + XCL_BO_SYNC_BO_AIE_TO_GMIO, + }; + + // direction - direction of sync operation + enum class direction { + host2device = XCL_BO_SYNC_BO_TO_DEVICE, + device2host = XCL_BO_SYNC_BO_FROM_DEVICE, + }; + + // properties - buffer details + struct properties { + uint64_t flags; // flags of bo + uint64_t size; // size of bo + uint64_t paddr; // physical address + uint64_t kmhdl; // kernel mode handle + }; + + bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, uint64_t flags, + amdxdna_bo_type type); + + 
bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, uint64_t flags); + + bo(const device& device, shared_handle::export_handle ehdl); + + ~bo(); + + void* map(map_type); + + void unmap(void* addr); + + void sync(direction, size_t size, size_t offset); + + properties get_properties() const; + + std::unique_ptr share() const; + + void copy(const bo* src, size_t size, size_t dst_offset, size_t src_offset) { + shim_not_supported_err(__func__); + } + + // For cmd BO only + void set_cmd_id(uint64_t id); + // For cmd BO only + uint64_t get_cmd_id() const; + + uint32_t get_drm_bo_handle() const; + + amdxdna_bo_type get_type() const; + + // DRM BO managed by driver. + struct drm_bo { + public: + bo& m_parent; + uint32_t m_handle = AMDXDNA_INVALID_BO_HANDLE; + off_t m_map_offset = AMDXDNA_INVALID_ADDR; + uint64_t m_xdna_addr = AMDXDNA_INVALID_ADDR; + uint64_t m_vaddr = AMDXDNA_INVALID_ADDR; + + drm_bo(bo& parent, const amdxdna_drm_get_bo_info& bo_info); + ~drm_bo(); + }; + + std::string describe() const; + + // Alloc DRM BO from driver + void alloc_bo(); + + // Import DRM BO from m_import shared object + void import_bo(); + + // Free DRM BO in driver + void free_bo(); + + void mmap_bo(size_t align = 0); + + void munmap_bo(); + + uint64_t get_paddr() const; + + std::string type_to_name() const; + + void attach_to_ctx(); + + void detach_from_ctx(); + + const pdev& m_pdev; + void* m_parent = nullptr; + void* m_aligned = nullptr; + size_t m_parent_size = 0; + size_t m_aligned_size = 0; + uint64_t m_flags = 0; + amdxdna_bo_type m_type = AMDXDNA_BO_INVALID; + std::unique_ptr m_bo; + const shared_handle m_import; + + // Command ID in the queue after command submission. + // Only valid for cmd BO. + uint64_t m_cmd_id = -1; + + // Used when exclusively assigned to a HW context. By default, BO is shared + // among all HW contexts. 
+ hw_ctx::slot_id m_owner_ctx_id = AMDXDNA_INVALID_CTX_HANDLE; + + void bind_at(size_t pos, const bo* bh, size_t offset, size_t size); + + // Support BO creation from internal + bo(const device& device, size_t size, amdxdna_bo_type type); + + // Obtain array of arg BO handles, returns real number of handles + uint32_t get_arg_bo_handles(uint32_t* handles, size_t num) const; + + // Only for AMDXDNA_BO_CMD type + std::map m_args_map; + mutable std::mutex m_args_map_lock; +}; + +} // namespace shim_xdna + +#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp new file mode 100644 index 000000000..20ec767ea --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. - All rights reserved + +#include "device.h" + +#include +#include + +#include +#include + +#include "bo.h" +#include "hwctx.h" + +namespace shim_xdna { + +device::device(const pdev& pdev, handle_type shim_handle) + : m_pdev(pdev), m_handle(shim_handle) { + shim_debug("Created KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str()); +} + +device::~device() { + shim_debug("Destroying KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str()); + m_pdev.close(); +} + +std::unique_ptr device::create_hw_context( + const device& dev, const hw_ctx::qos_type& qos) const { + return std::make_unique(dev, qos); +} + +std::unique_ptr device::alloc_bo(void* userptr, hw_ctx::slot_id ctx_id, + size_t size, uint64_t flags) { + if (userptr) shim_not_supported_err("User ptr BO"); + + auto b = bo(this->m_pdev, ctx_id, size, flags); + return std::make_unique(*this, ctx_id, size, flags); +} + +std::unique_ptr device::import_bo(shared_handle::export_handle ehdl) const { + return std::make_unique(*this, ehdl); +} + +std::vector device::read_aie_mem(uint16_t col, uint16_t row, + uint32_t offset, 
uint32_t size) { + amdxdna_drm_aie_mem mem; + std::vector store_buf(size); + + mem.col = col; + mem.row = row; + mem.addr = offset; + mem.size = size; + mem.buf_p = reinterpret_cast(store_buf.data()); + + amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_MEM, + .buffer_size = sizeof(mem), + .buffer = reinterpret_cast(&mem)}; + + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg); + + return store_buf; +} + +uint32_t device::read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr) { + amdxdna_drm_aie_reg reg; + + reg.col = col; + reg.row = row; + reg.addr = reg_addr; + reg.val = 0; + + amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_REG, + .buffer_size = sizeof(reg), + .buffer = reinterpret_cast(®)}; + + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg); + + return reg.val; +} + +size_t device::write_aie_mem(uint16_t col, uint16_t row, uint32_t offset, + const std::vector& buf) { + amdxdna_drm_aie_mem mem; + uint32_t size = static_cast(buf.size()); + + mem.col = col; + mem.row = row; + mem.addr = offset; + mem.size = size; + mem.buf_p = reinterpret_cast(buf.data()); + + amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_MEM, + .buffer_size = sizeof(mem), + .buffer = reinterpret_cast(&mem)}; + + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg); + + return size; +} + +bool device::write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr, + uint32_t reg_val) { + amdxdna_drm_aie_reg reg; + + reg.col = col; + reg.row = row; + reg.addr = reg_addr; + reg.val = reg_val; + + amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_REG, + .buffer_size = sizeof(reg), + .buffer = reinterpret_cast(®)}; + + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg); + + return true; +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h new file mode 100644 index 000000000..8db0f2227 --- /dev/null +++ 
b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef PCIE_DEVICE_LINUX_XDNA_H +#define PCIE_DEVICE_LINUX_XDNA_H + +#include + +#include "pcidev.h" +#include "shim_debug.h" + +namespace shim_xdna { + +typedef void* xclDeviceHandle; +#define XRT_NULL_HANDLE NULL + +// cuidx_type - encode cuidx and domain +// +// @domain_index: index within domain +// @domain: domain identifier +// @index: combined encoded index +// +// The domain_index is used in command cumask in exec_buf +// The combined index is used in context creation in open_context +struct cuidx_type { + union { + std::uint32_t index; + struct { + std::uint16_t domain_index; // [15-0] + std::uint16_t domain; // [31-16] + }; + }; + + // Ensure consistent use of domain and index types + using domain_type = uint16_t; + using domain_index_type = uint16_t; +}; + +struct device { + // device index type + using id_type = unsigned int; + using slot_id = uint32_t; + using handle_type = xclDeviceHandle; + + device(const pdev& pdev, handle_type shim_handle); + + ~device(); + + std::unique_ptr alloc_bo(void* userptr, hw_ctx::slot_id ctx_id, + size_t size, uint64_t flags); + + std::unique_ptr create_hw_context(const device& dev, + const hw_ctx::qos_type& qos) const; + + std::unique_ptr import_bo(shared_handle::export_handle ehdl) const; + + const pdev& get_pdev() const; + + std::unique_ptr alloc_bo(size_t size, uint64_t flags); + + std::unique_ptr import_bo(pid_t, shared_handle::export_handle); + + std::unique_ptr create_hw_context(const hw_ctx::qos_type& qos, + hw_ctx::access_mode mode) const; + + std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, + uint32_t size); + + size_t write_aie_mem(uint16_t col, uint16_t row, uint32_t offset, + const std::vector& buf); + + uint32_t read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr); + + bool 
write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr, + uint32_t reg_val); + + const pdev& m_pdev; // The pcidev that this device object is derived from + std::map m_bo_map; + xclDeviceHandle m_handle = XRT_NULL_HANDLE; + + mutable std::mutex m_mutex; +}; + +} // namespace shim_xdna + +#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h new file mode 100644 index 000000000..ac5858db4 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h @@ -0,0 +1,1176 @@ +/* + * Copyright (C) 2019-2022, Xilinx Inc + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + * + * Apache License Verbiage + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * GPL license Verbiage: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. This program is + * distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + * License for more details. 
You should have received a copy of the + * GNU General Public License along with this program; if not, write + * to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA + * + */ + +/** + * DOC: XRT Embedded Runtime definition + * + * Header file *ert.h* defines data structures used by Emebdded Runtime (ERT) and + * XRT xclExecBuf() API. + */ + +#ifndef _ERT_H_ +#define _ERT_H_ + +#if defined(__linux__) && defined(__KERNEL__) +# include +#elif defined(__windows__) && defined(_KERNEL_MODE) +# include +#elif defined(__cplusplus) && !defined(_KERNEL_MODE) +# include +# include +#else +# include +# include +# include +#endif + +#ifdef _WIN32 +# pragma warning( push ) +# pragma warning( disable : 4200 4201 ) +#endif + +#if defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wpedantic" +#endif + +#define to_cfg_pkg(pkg) \ + ((struct ert_configure_cmd *)(pkg)) +#define to_start_krnl_pkg(pkg) \ + ((struct ert_start_kernel_cmd *)(pkg)) +#define to_copybo_pkg(pkg) \ + ((struct ert_start_copybo_cmd *)(pkg)) +#define to_cfg_sk_pkg(pkg) \ + ((struct ert_configure_sk_cmd *)(pkg)) +#define to_init_krnl_pkg(pkg) \ + ((struct ert_init_kernel_cmd *)(pkg)) +#define to_validate_pkg(pkg) \ + ((struct ert_validate_cmd *)(pkg)) +#define to_abort_pkg(pkg) \ + ((struct ert_abort_cmd *)(pkg)) + + +#define HOST_RW_PATTERN 0xF0F0F0F0 +#define DEVICE_RW_PATTERN 0x0F0F0F0F + +/** + * struct ert_packet: ERT generic packet format + * + * @state: [3-0] current state of a command + * @custom: [11-4] custom per specific commands + * @count: [22-12] number of words in payload (data) + * @opcode: [27-23] opcode identifying specific command + * @type: [31-28] type of command (currently 0) + * @data: count number of words representing packet payload + */ +struct ert_packet { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t custom:8; /* [11-4] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + 
uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + uint32_t data[1]; /* count number of words */ +}; + +/** + * struct ert_start_kernel_cmd: ERT start kernel command format + * + * @state: [3-0] current state of a command + * @stat_enabled: [4] enabled driver to record timestamp for various + * states cmd has gone through. The stat data + * is appended after cmd data. + * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask + * @count: [22-12] number of words following header for cmd data. Not + * include stat data. + * @opcode: [27-23] 0, opcode for start_kernel + * @type: [31-28] 0, type of start_kernel + * + * @cu_mask: first mandatory CU mask + * @data: count-1 number of words representing interpreted payload + * + * The packet payload is comprised of reserved id field, a mandatory CU mask, + * and extra_cu_masks per header field, followed by a CU register map of size + * (count - (1 + extra_cu_masks)) uint32_t words. + */ +struct ert_start_kernel_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t stat_enabled:1; /* [4] */ + uint32_t unused:5; /* [9-5] */ + uint32_t extra_cu_masks:2; /* [11-10] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t cu_mask; /* mandatory cu mask */ + uint32_t data[1]; /* count-1 number of words */ +}; + +/** + * struct ert_dpu_data - interpretation of data payload for ERT_START_DPU + * + * @instruction_buffer: address of instruction buffer + * @instruction_buffer_size: size of instruction buffer in bytes + * @chained: number of following ert_dpu_data elements + * + * The ert_dpu_data is prepended to data payload of ert_start_kernel_cmd + * after any extra cu masks. The payload count of the ert packet is + * incremented with the size (words) of ert_dpu_data elements + * prepended to the data payload. 
+ * + * The data payload for ERT_START_DPU is interpreted as fixed instruction + * buffer address along with instruction count, followed by regular kernel + * arguments. + */ +struct ert_dpu_data { + uint64_t instruction_buffer; /* buffer address 2 words */ + uint32_t instruction_buffer_size; /* size of buffer in bytes */ + uint32_t chained; /* number of following ert_dpu_data elements */ +}; + +/** + * struct ert_npu_data - interpretation of data payload for ERT_START_NPU + * + * @instruction_buffer: address of instruction buffer + * @instruction_buffer_size: size of instruction buffer in bytes + * @instruction_prop_count: WORD length of property name value pairs + * + * The ert_npu_data is prepended to data payload of ert_start_kernel_cmd + * after any extra cu masks. The payload count of the ert packet is + * incremented with the size (words) of ert_npu_data elements + * prepended to the data payload. + * + * The data payload for ERT_START_NPU is interpreted as instruction + * buffer address, instruction count along with instruction property, + * followed by regular kernel arguments. + * + * When instruction_prop_count is non-zero, it indicates the length + * (in 32 bits WORD) of the instruction buffer properties after this + * field. This count is reserved for future extension. One example + * property is the number of actual columns this instruction used. 
+ */ +struct ert_npu_data { + uint64_t instruction_buffer; /* buffer address 2 words */ + uint32_t instruction_buffer_size; /* size of buffer in bytes */ + uint32_t instruction_prop_count; /* WORD length of following properties nv pairs */ +}; + +/** + * struct ert_npu_preempt_data - interpretation of data payload for ERT_START_NPU_PREEMPT + * + * @instruction_buffer: address of instruction buffer + * @save_buffer: address of save instruction buffer + * @restore_buffer: address of restore instruction buffer + * @instruction_buffer_size: size of instruction buffer in bytes + * @save_buffer_size: size of save instruction buffer in bytes + * @restore_buffer_size: size of restore instruction buffer in bytes + * @instruction_prop_count: WORD length of property name value pairs + * + * The ert_npu_preempt_data is prepended to data payload of ert_start_kernel_cmd + * after any extra cu masks. The payload count of the ert packet is + * incremented with the size (words) of ert_npu_preempt_data elements + * prepended to the data payload. + * + * The data payload for ERT_START_NPU_PREEMPT is interpreted as instruction + * buffer, save instruction buffer, restore instruction buffer and their + * size, along with instruction property, followed by regular kernel arguments. + * + * When instruction_prop_count is non-zero, it indicates the length + * (in 32 bits WORD) of the instruction buffer properties after this + * field. This count is reserved for future extension. One example + * property is the number of actual columns this instruction used. 
+ */ +struct ert_npu_preempt_data { + uint64_t instruction_buffer; /* buffer address 2 words */ + uint64_t save_buffer; /* buffer address 2 words */ + uint64_t restore_buffer; /* buffer address 2 words */ + uint32_t instruction_buffer_size; /* size of buffer in bytes */ + uint32_t save_buffer_size; /* size of buffer in bytes */ + uint32_t restore_buffer_size; /* size of buffer in bytes */ + uint32_t instruction_prop_count; /* DWORD length of following properties nv pairs */ +}; + +/** + * struct ert_cmd_chain_data - interpretation of data payload for ERT_CMD_CHAIN + * + * @command_count: number of commands in chain + * @submit_index: index of last successfully submitted command in chain + * @error_index: index of failing command if cmd status is not completed + * @data[]: address of each command in chain + * + * This is the payload of an *ert_packet* when the opcode is ERT_CMD_CHAIN + */ +struct ert_cmd_chain_data { + uint32_t command_count; + uint32_t submit_index; + uint32_t error_index; + uint32_t reserved[3]; + uint64_t data[]; +}; + +#ifndef U30_DEBUG +#define ert_write_return_code(cmd, value) \ +do { \ + struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ + int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks; \ + skcmd->data[end_idx] = value; \ +} while (0) + +#define ert_read_return_code(cmd, ret) \ +do { \ + struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ + int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks; \ + ret = skcmd->data[end_idx]; \ +} while (0) +#else +/* These are for debug legacy U30 firmware */ +#define ert_write_return_code(cmd, value) \ +do { \ + struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ + skcmd->cu_mask = value; \ +} while (0) + +#define ert_read_return_code(cmd, ret) \ +do { \ + struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ + ret = skcmd->cu_mask; \ +} while (0) +#endif + +/** + * struct ert_init_kernel_cmd: ERT initialize 
kernel command format + * this command initializes CUs by writing CU registers. CUs are + * represented by cu_mask and extra_cu_masks. + * + * @state: [3-0] current state of a command + * @update_rtp: [4] command is for runtime update of cu argument + * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask + * @count: [22-12] number of words following header + * @opcode: [27-23] 0, opcode for init_kernel + * @type: [31-27] 0, type of init_kernel + * + * @cu_run_timeout the configured CU timeout value in Microseconds + * setting to 0 means CU should not timeout + * @cu_reset_timeout the configured CU reset timeout value in Microseconds + * when CU timeout, CU will be reset. this indicates + * CU reset should be completed within the timeout value. + * if cu_run_timeout is set to 0, this field is undefined. + * + * @cu_mask: first mandatory CU mask + * @data: count-9 number of words representing interpreted payload + * + * The packet payload is comprised of reserved id field, 8 reserved fields, + * a mandatory CU mask, and extra_cu_masks per header field, followed by a + * CU register map of size (count - (9 + extra_cu_masks)) uint32_t words. 
+ */ +struct ert_init_kernel_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t update_rtp:1; /* [4] */ + uint32_t unused:5; /* [9-5] */ + uint32_t extra_cu_masks:2; /* [11-10] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + + uint32_t cu_run_timeout; /* CU timeout value in Microseconds */ + uint32_t cu_reset_timeout; /* CU reset timeout value in Microseconds */ + uint32_t reserved[6]; /* reserved for future use */ + + /* payload */ + uint32_t cu_mask; /* mandatory cu mask */ + uint32_t data[1]; /* count-9 number of words */ +}; + +#define KDMA_BLOCK_SIZE 64 /* Limited by KDMA CU */ +struct ert_start_copybo_cmd { + uint32_t state:4; /* [3-0], must be ERT_CMD_STATE_NEW */ + uint32_t unused:6; /* [9-4] */ + uint32_t extra_cu_masks:2; /* [11-10], = 3 */ + uint32_t count:11; /* [22-12], = 16, exclude 'arg' */ + uint32_t opcode:5; /* [27-23], = ERT_START_COPYBO */ + uint32_t type:4; /* [31-28], = ERT_DEFAULT */ + uint32_t cu_mask[4]; /* mandatory cu masks */ + uint32_t reserved[4]; /* for scheduler use */ + uint32_t src_addr_lo; /* low 32 bit of src addr */ + uint32_t src_addr_hi; /* high 32 bit of src addr */ + uint32_t src_bo_hdl; /* src bo handle, cleared by driver */ + uint32_t dst_addr_lo; /* low 32 bit of dst addr */ + uint32_t dst_addr_hi; /* high 32 bit of dst addr */ + uint32_t dst_bo_hdl; /* dst bo handle, cleared by driver */ + uint32_t size; /* size in bytes low 32 bit*/ + uint32_t size_hi; /* size in bytes high 32 bit*/ + void *arg; /* pointer to aux data for KDS */ +}; + +/** + * struct ert_configure_cmd: ERT configure command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload (5 + num_cus) + * @opcode: [27-23] 1, opcode for configure + * @type: [31-28] 0, type of configure + * + * @slot_size: command queue slot size + * @num_cus: number of compute units in program + * @cu_shift: shift value to 
convert CU idx to CU addr + * @cu_base_addr: base address to add to CU addr for actual physical address + * + * @ert:1 enable embedded HW scheduler + * @polling:1 poll for command completion + * @cu_dma:1 enable CUDMA custom module for HW scheduler + * @cu_isr:1 enable CUISR custom module for HW scheduler + * @cq_int:1 enable interrupt from host to HW scheduler + * @cdma:1 enable CDMA kernel + * @unusedf:19 unused (after the 1-bit dataflow, rw_shared, kds_30, dmsg, echo, intr flags) + * @dsa52:1 reserved for internal use + * + * @data: addresses of @num_cus CUs + */ +struct ert_configure_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t unused:8; /* [11-4] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t slot_size; + uint32_t num_cus; + uint32_t cu_shift; + uint32_t cu_base_addr; + + /* features */ + uint32_t ert:1; + uint32_t polling:1; + uint32_t cu_dma:1; + uint32_t cu_isr:1; + uint32_t cq_int:1; + uint32_t cdma:1; + uint32_t dataflow:1; + /* WORKAROUND: allow xclRegWrite/xclRegRead access shared CU */ + uint32_t rw_shared:1; + uint32_t kds_30:1; + uint32_t dmsg:1; + uint32_t echo:1; + uint32_t intr:1; + uint32_t unusedf:19; + uint32_t dsa52:1; + + /* cu address map size is num_cus */ + uint32_t data[1]; +}; + +/* + * Note: We need to put maximum 128 soft kernel image + * in one config command (1024 DWs including header). + * So each one needs to be smaller than 8 DWs. + * + * This data struct is obsoleted. Only used in legacy ERT firmware. + * Use 'struct config_sk_image_uuid' instead on XGQ based ERT. 
+ * + * @start_cuidx: start index of compute units of each image + * @num_cus: number of compute units of each image + * @sk_name: symbol name of soft kernel of each image + */ +struct config_sk_image { + uint32_t start_cuidx; + uint32_t num_cus; + uint32_t sk_name[5]; +}; + +/* + * Note: We need to put maximum 128 soft kernel image + * in one config command (1024 DWs including header). 
+ * So each one needs to be smaller than 8 DWs. + * + * @start_cuidx: start index of compute units of each image + * @num_cus: number of compute units of each image + * @sk_name: symbol name of soft kernel of each image + * @sk_uuid: xclbin uuid that this soft kernel image belongs to + */ +struct config_sk_image_uuid { + uint32_t start_cuidx; + uint32_t num_cus; + uint32_t sk_name[5]; + unsigned char sk_uuid[16]; + uint32_t slot_id; +}; + +/** + * struct ert_configure_sk_cmd: ERT configure soft kernel command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload + * @opcode: [27-23] 1, opcode for configure + * @type: [31-28] 0, type of configure + * + * @num_image: number of images +*/ +struct ert_configure_sk_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t unused:8; /* [11-4] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t num_image; + struct config_sk_image image[1]; +}; + +/** + * struct ert_unconfigure_sk_cmd: ERT unconfigure soft kernel command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload + * @opcode: [27-23] 1, opcode for configure + * @type: [31-28] 0, type of configure + * + * @start_cuidx: start index of compute units + * @num_cus: number of compute units in program + */ +struct ert_unconfigure_sk_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t unused:8; /* [11-4] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t start_cuidx; + uint32_t num_cus; +}; + +/** + * struct ert_abort_cmd: ERT abort command format. 
+ * + * @exec_bo_handle: The bo handle of execbuf command to abort + */ +struct ert_abort_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t custom:8; /* [11-4] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + + /* payload */ + uint64_t exec_bo_handle; +}; + +/** + * struct ert_validate_cmd: ERT BIST command format. + * + */ +struct ert_validate_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t custom:8; /* [11-4] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + uint32_t timestamp; + uint32_t cq_read_single; + uint32_t cq_write_single; + uint32_t cu_read_single; + uint32_t cu_write_single; +}; + +/** + * struct ert_access_valid_cmd: ERT access validation command format. + * + */ +struct ert_access_valid_cmd { + union { + struct { + uint32_t state:4; /* [3-0] */ + uint32_t custom:8; /* [11-4] */ + uint32_t count:11; /* [22-12] */ + uint32_t opcode:5; /* [27-23] */ + uint32_t type:4; /* [31-28] */ + }; + uint32_t header; + }; + uint32_t h2h_access; + uint32_t h2d_access; + uint32_t d2h_access; + uint32_t d2d_access; + uint32_t d2cu_access; + uint32_t wr_count; + uint32_t wr_test; +}; + +/** + * ERT command state + * + * @ERT_CMD_STATE_NEW: Set by host before submitting a command to + * scheduler + * @ERT_CMD_STATE_QUEUED: Internal scheduler state + * @ERT_CMD_STATE_SUBMITTED: Internal scheduler state + * @ERT_CMD_STATE_RUNNING: Internal scheduler state + * @ERT_CMD_STATE_COMPLETED: Set by scheduler when command completes + * @ERT_CMD_STATE_ERROR: Set by scheduler if command failed + * @ERT_CMD_STATE_ABORT: Set by scheduler if command abort + * @ERT_CMD_STATE_TIMEOUT: Set by scheduler if command timeout and reset + * @ERT_CMD_STATE_NORESPONSE: Set by scheduler if command timeout and fail to + * reset + */ +enum ert_cmd_state { + ERT_CMD_STATE_NEW = 1, + ERT_CMD_STATE_QUEUED 
= 2, + ERT_CMD_STATE_RUNNING = 3, + ERT_CMD_STATE_COMPLETED = 4, + ERT_CMD_STATE_ERROR = 5, + ERT_CMD_STATE_ABORT = 6, + ERT_CMD_STATE_SUBMITTED = 7, + ERT_CMD_STATE_TIMEOUT = 8, + ERT_CMD_STATE_NORESPONSE = 9, + ERT_CMD_STATE_SKERROR = 10, //Check for error return code from Soft Kernel + ERT_CMD_STATE_SKCRASHED = 11, //Soft kernel has crashed + ERT_CMD_STATE_MAX, // Always the last one +}; + +struct cu_cmd_state_timestamps { + uint64_t skc_timestamps[ERT_CMD_STATE_MAX]; // In nano-second +}; + +/** + * Opcode types for commands + * + * @ERT_START_CU: start a workgroup on a CU + * @ERT_START_KERNEL: currently aliased to ERT_START_CU + * @ERT_CONFIGURE: configure command scheduler + * @ERT_EXEC_WRITE: execute a specified CU after writing + * @ERT_CU_STAT: get stats about CU execution + * @ERT_START_COPYBO: start KDMA CU or P2P, may be converted to ERT_START_CU + * before cmd reach to scheduler, short-term hack + * @ERT_SK_CONFIG: configure soft kernel + * @ERT_SK_START: start a soft kernel + * @ERT_SK_UNCONFIG: unconfigure a soft kernel + * @ERT_START_KEY_VAL: same as ERT_START_CU but with key-value pair flavor + * @ERT_START_DPU: instruction buffer command format + * @ERT_CMD_CHAIN: command chain + * @ERT_START_NPU: instruction buffer command format on NPU format + * @ERT_START_NPU_PREEMPT: instruction buffer command with preemption format on NPU + */ +enum ert_cmd_opcode { + ERT_START_CU = 0, + ERT_START_KERNEL = 0, + ERT_CONFIGURE = 2, + ERT_EXIT = 3, + ERT_ABORT = 4, + ERT_EXEC_WRITE = 5, + ERT_CU_STAT = 6, + ERT_START_COPYBO = 7, + ERT_SK_CONFIG = 8, + ERT_SK_START = 9, + ERT_SK_UNCONFIG = 10, + ERT_INIT_CU = 11, + ERT_START_FA = 12, + ERT_CLK_CALIB = 13, + ERT_MB_VALIDATE = 14, + ERT_START_KEY_VAL = 15, + ERT_ACCESS_TEST_C = 16, + ERT_ACCESS_TEST = 17, + ERT_START_DPU = 18, + ERT_CMD_CHAIN = 19, + ERT_START_NPU = 20, + ERT_START_NPU_PREEMPT = 21, +}; + +/** + * Command types + * + * @ERT_DEFAULT: default command type + * @ERT_KDS_LOCAL: command processed by 
KDS locally + * @ERT_CTRL: control command uses reserved command queue slot + * @ERT_CU: compute unit command + */ +enum ert_cmd_type { + ERT_DEFAULT = 0, + ERT_KDS_LOCAL = 1, + ERT_CTRL = 2, + ERT_CU = 3, + ERT_SCU = 4, +}; + +/** + * Soft kernel types + * + * @SOFTKERNEL_TYPE_EXEC: executable + */ +enum softkernel_type { + SOFTKERNEL_TYPE_EXEC = 0, +}; + +/* + * Base address GPIO per spec + * | Offset | Description + * ----------------------- + * | 0x00 | ERT_MGMT_PF_base_addr (Not sure where this should be use) + * | 0x08 | ERT_USER_PF_base_addr. The base address of ERT peripherals + */ +#if defined(ERT_BUILD_V20) +uint32_t ert_base_addr = 0; +# define ERT_BASE_ADDR 0x01F30008 +#endif + +#if defined(ERT_BUILD_V30) +uint32_t ert_base_addr = 0; +# define ERT_BASE_ADDR 0x01F30008 +#endif + +/** + * Address constants per spec + */ +#define ERT_WORD_SIZE 4 /* 4 bytes */ +#define ERT_CQ_SIZE 0x10000 /* 64K */ +#if defined(ERT_BUILD_U50) +# define ERT_CQ_BASE_ADDR 0x340000 +# define ERT_CSR_ADDR 0x360000 +#elif defined(ERT_BUILD_V20) +# define ERT_CQ_BASE_ADDR (0x000000 + ert_base_addr) +# define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#elif defined(ERT_BUILD_V30) +# define ERT_CQ_BASE_ADDR 0x1F60000 +# define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#else +# define ERT_CQ_BASE_ADDR 0x190000 +# define ERT_CSR_ADDR 0x180000 +#endif + +/** + * The STATUS REGISTER is for communicating completed CQ slot indices + * MicroBlaze write, host reads. MB(W) / HOST(COR) + */ +#define ERT_STATUS_REGISTER_ADDR (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR0 (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR1 (ERT_CSR_ADDR + 0x4) +#define ERT_STATUS_REGISTER_ADDR2 (ERT_CSR_ADDR + 0x8) +#define ERT_STATUS_REGISTER_ADDR3 (ERT_CSR_ADDR + 0xC) + +/** + * The CU DMA REGISTER is for communicating which CQ slot is to be started + * on a specific CU. 
MB selects a free CU on which the command can + * run, then writes the 1<state = ERT_CMD_STATE_NEW; + pkt->extra_cu_masks = 3; + pkt->count = 16; + pkt->opcode = ERT_START_COPYBO; + pkt->type = ERT_DEFAULT; + pkt->cu_mask[0] = 0; + pkt->cu_mask[1] = 0; + pkt->cu_mask[2] = 0; + pkt->cu_mask[3] = 0; + pkt->src_addr_lo = (uint32_t)src_offset; + pkt->src_addr_hi = (src_offset >> 32) & 0xFFFFFFFF; + pkt->src_bo_hdl = src_bo; + pkt->dst_addr_lo = (uint32_t)dst_offset; + pkt->dst_addr_hi = (dst_offset >> 32) & 0xFFFFFFFF; + pkt->dst_bo_hdl = dst_bo; + pkt->size = size; + pkt->size_hi = 0; /* set to 0 explicitly */ + pkt->arg = 0; +} +static inline uint64_t +ert_copybo_src_offset(struct ert_start_copybo_cmd *pkt) +{ + return (uint64_t)pkt->src_addr_hi << 32 | pkt->src_addr_lo; +} +static inline uint64_t +ert_copybo_dst_offset(struct ert_start_copybo_cmd *pkt) +{ + return (uint64_t)pkt->dst_addr_hi << 32 | pkt->dst_addr_lo; +} +static inline uint64_t +ert_copybo_size(struct ert_start_copybo_cmd *pkt) +{ + return pkt->size; +} + +static inline bool +ert_valid_opcode(struct ert_packet *pkt) +{ + struct ert_start_kernel_cmd *skcmd; + struct ert_init_kernel_cmd *ikcmd; + struct ert_start_copybo_cmd *sccmd; + struct ert_configure_cmd *ccmd; + struct ert_configure_sk_cmd *cscmd; + struct ert_cmd_chain_data *ccdata; + bool valid; + + switch (pkt->opcode) { + case ERT_START_CU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 4 registers */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 4); + break; + case ERT_START_DPU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + size (in words) of ert_dpu_data */ + valid = (skcmd->count >= 1+ skcmd->extra_cu_masks + sizeof(struct ert_dpu_data) / sizeof(uint32_t)); + break; + case ERT_CMD_CHAIN: + ccdata = (struct ert_cmd_chain_data*) pkt->data; + /* header count must match number of commands in payload */ + valid = (pkt->count == (ccdata->command_count * sizeof(uint64_t) + sizeof(struct 
ert_cmd_chain_data)) / sizeof(uint32_t)); + break; + case ERT_START_NPU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + ert_npu_data */ + valid = (skcmd->count >= 1+ skcmd->extra_cu_masks + sizeof(struct ert_npu_data) / sizeof(uint32_t)); + break; + case ERT_START_NPU_PREEMPT: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + ert_npu_preempt_data */ + valid = (skcmd->count >= 1+ skcmd->extra_cu_masks + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t)); + break; + case ERT_START_KEY_VAL: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1); + break; + case ERT_EXEC_WRITE: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 6 registers */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 6); + break; + case ERT_START_FA: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1); + break; + case ERT_SK_START: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 1 control word */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 1); + break; + case ERT_CONFIGURE: + ccmd = to_cfg_pkg(pkt); + /* 5 mandatory fields in struct */ + valid = (ccmd->count >= 5 + ccmd->num_cus); + break; + case ERT_START_COPYBO: + sccmd = to_copybo_pkg(pkt); + valid = (sccmd->count == 16); + break; + case ERT_INIT_CU: + ikcmd = to_init_krnl_pkg(pkt); + /* 9 mandatory words in struct + 4 control registers */ + valid = (ikcmd->count >= ikcmd->extra_cu_masks + 9 + 4); + break; + case ERT_SK_CONFIG: + cscmd = to_cfg_sk_pkg(pkt); + valid = (cscmd->count == sizeof(struct config_sk_image) * cscmd->num_image / 4 + 1); + break; + case ERT_CLK_CALIB: + case ERT_MB_VALIDATE: + case ERT_ACCESS_TEST_C: + case ERT_CU_STAT: /* TODO: Rules to validate? 
*/ + case ERT_EXIT: + case ERT_ABORT: + valid = true; + break; + case ERT_SK_UNCONFIG: /* NOTE: obsolete */ + default: + valid = false; + } + + return valid; +} + +static inline uint64_t +get_ert_packet_size_bytes(struct ert_packet *pkt) +{ + // header plus payload + return sizeof(pkt->header) + pkt->count * sizeof(uint32_t); +} + +static inline struct ert_dpu_data* +get_ert_dpu_data(struct ert_start_kernel_cmd* pkt) +{ + if (pkt->opcode != ERT_START_DPU) + return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_dpu_data*) (pkt->data + pkt->extra_cu_masks); +} + +static inline struct ert_dpu_data* +get_ert_dpu_data_next(struct ert_dpu_data* dpu_data) +{ + if (dpu_data->chained == 0) + return NULL; + + return dpu_data + 1; +} + +static inline struct ert_cmd_chain_data* +get_ert_cmd_chain_data(struct ert_packet* pkt) +{ + if (pkt->opcode != ERT_CMD_CHAIN) + return NULL; + + return (struct ert_cmd_chain_data*) pkt->data; +} + +static inline struct ert_npu_data* +get_ert_npu_data(struct ert_start_kernel_cmd* pkt) +{ + if (pkt->opcode != ERT_START_NPU) + return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_npu_data*) (pkt->data + pkt->extra_cu_masks); +} + +static inline struct ert_npu_preempt_data* +get_ert_npu_preempt_data(struct ert_start_kernel_cmd* pkt) +{ + if (pkt->opcode != ERT_START_NPU_PREEMPT) + return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_npu_preempt_data*) (pkt->data + pkt->extra_cu_masks); +} + +static inline uint32_t* +get_ert_regmap_begin(struct ert_start_kernel_cmd* pkt) +{ + switch (pkt->opcode) { + case ERT_START_DPU: + return pkt->data + pkt->extra_cu_masks + + (get_ert_dpu_data(pkt)->chained + 1) * sizeof(struct ert_dpu_data) / sizeof(uint32_t); + + case ERT_START_NPU: + return pkt->data + pkt->extra_cu_masks + + sizeof(struct ert_npu_data) / sizeof(uint32_t) + + get_ert_npu_data(pkt)->instruction_prop_count; + + case 
ERT_START_NPU_PREEMPT: + return pkt->data + pkt->extra_cu_masks + + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t) + + get_ert_npu_preempt_data(pkt)->instruction_prop_count; + + default: + // skip past embedded extra cu_masks + return pkt->data + pkt->extra_cu_masks; + } +} + +static inline uint32_t* +get_ert_regmap_end(struct ert_start_kernel_cmd* pkt) +{ + // pkt->count includes the mandatory cumask which precedes data array + return &pkt->cu_mask + pkt->count; +} + +static inline uint64_t +get_ert_regmap_size_bytes(struct ert_start_kernel_cmd* pkt) +{ + return (get_ert_regmap_end(pkt) - get_ert_regmap_begin(pkt)) * sizeof(uint32_t); +} + +#ifdef __linux__ +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +static inline struct cu_cmd_state_timestamps * +ert_start_kernel_timestamps(struct ert_start_kernel_cmd *pkt) +{ + uint64_t offset = pkt->count * sizeof(uint32_t) + sizeof(pkt->header); + /* Make sure the offset of timestamps are properly aligned. */ + return (struct cu_cmd_state_timestamps *) + ((char *)pkt + P2ROUNDUP(offset, sizeof(uint64_t))); +} + +/* Return 0 if this pkt doesn't support timestamp or disabled */ +static inline int +get_size_with_timestamps_or_zero(struct ert_packet *pkt) +{ + struct ert_start_kernel_cmd *skcmd; + int size = 0; + + switch (pkt->opcode) { + case ERT_START_CU: + case ERT_EXEC_WRITE: + case ERT_START_FA: + case ERT_SK_START: + skcmd = to_start_krnl_pkg(pkt); + if (skcmd->stat_enabled) { + size = (char *)ert_start_kernel_timestamps(skcmd) - (char *)pkt; + size += sizeof(struct cu_cmd_state_timestamps); + } + } + + return size; +} +#endif + +#if defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + +#ifdef _WIN32 +# pragma warning( pop ) +#endif + +#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp new file mode 100644 index 000000000..4af239eb1 --- /dev/null +++ 
b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "fence.h" + +#include + +#include "amdxdna_accel.h" + +namespace { + +uint32_t create_syncobj(const shim_xdna::pdev& dev) { + drm_syncobj_create csobj = {.handle = AMDXDNA_INVALID_FENCE_HANDLE, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_CREATE, &csobj); + return csobj.handle; +} + +void destroy_syncobj(const shim_xdna::pdev& dev, uint32_t hdl) { + drm_syncobj_destroy dsobj = {.handle = hdl}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_DESTROY, &dsobj); +} + +uint64_t query_syncobj_timeline(const shim_xdna::pdev& dev, uint32_t sobj_hdl) { + uint64_t point = 0; + drm_syncobj_timeline_array sobjs = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&point), + .count_handles = 1, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_QUERY, &sobjs); + return point; +} + +int export_syncobj(const shim_xdna::pdev& dev, uint32_t sobj_hdl) { + drm_syncobj_handle esobj = { + .handle = sobj_hdl, + .flags = 0, + .fd = -1, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &esobj); + return esobj.fd; +} + +uint32_t import_syncobj(const shim_xdna::pdev& dev, int fd) { + drm_syncobj_handle isobj = { + .handle = AMDXDNA_INVALID_FENCE_HANDLE, + .flags = 0, + .fd = fd, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &isobj); + return isobj.handle; +} + +void signal_syncobj(const shim_xdna::pdev& dev, uint32_t sobj_hdl, + uint64_t timepoint) { + drm_syncobj_timeline_array sobjs = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&timepoint), + .count_handles = 1, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, &sobjs); +} + +void wait_syncobj_done(const shim_xdna::pdev& dev, uint32_t sobj_hdl, + uint64_t timepoint) { + drm_syncobj_timeline_wait wsobj = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&timepoint), + 
.timeout_nsec = std::numeric_limits::max(), /* wait forever */ + .count_handles = 1, + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); +} + +void wait_syncobj_available(const shim_xdna::pdev& dev, + const uint32_t* sobj_hdls, + const uint64_t* timepoints, uint32_t num) { + drm_syncobj_timeline_wait wsobj = { + .handles = reinterpret_cast(sobj_hdls), + .points = reinterpret_cast(timepoints), + .timeout_nsec = std::numeric_limits::max(), /* wait forever */ + .count_handles = num, + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); +} + +void submit_wait_syncobjs(const shim_xdna::pdev& dev, + const shim_xdna::hw_ctx* ctx, + const uint32_t* sobj_hdls, const uint64_t* points, + uint32_t num) { + wait_syncobj_available(dev, sobj_hdls, points, num); + + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_DEPENDENCY, + .cmd_handles = reinterpret_cast(sobj_hdls), + .args = reinterpret_cast(points), + .cmd_count = num, + .arg_count = num, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); +} + +void submit_signal_syncobj(const shim_xdna::pdev& dev, + const shim_xdna::hw_ctx* ctx, uint32_t sobj_hdl, + uint64_t point) { + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_SIGNAL, + .cmd_handles = sobj_hdl, + .args = point, + .cmd_count = 1, + .arg_count = 1, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); +} + +} // namespace + +namespace shim_xdna { + +fence::fence(const device& device) + : m_pdev(device.get_pdev()), + m_import(std::make_unique(-1)), + m_syncobj_hdl(create_syncobj(m_pdev)) { + shim_debug("Fence allocated: %d@%d", m_syncobj_hdl, m_state); +} + +fence::fence(const device& device, shared_handle::export_handle ehdl) + : m_pdev(device.get_pdev()), + m_import(std::make_unique(ehdl)), + 
m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())) { + shim_debug("Fence imported: %d@%ld", m_syncobj_hdl, m_state); +} + +fence::fence(const fence& f) + : m_pdev(f.m_pdev), + m_import(f.share()), + m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())), + m_state{f.m_state}, + m_signaled{f.m_signaled} { + shim_debug("Fence cloned: %d@%ld", m_syncobj_hdl, m_state); +} + +fence::~fence() { + shim_debug("Fence going away: %d@%ld", m_syncobj_hdl, m_state); + destroy_syncobj(m_pdev, m_syncobj_hdl); +} + +std::unique_ptr fence::share() const { + if (m_state != initial_state) + shim_err(-EINVAL, "Can't share fence not at initial state."); + + return std::make_unique(export_syncobj(m_pdev, m_syncobj_hdl)); +} + +uint64_t fence::get_next_state() const { return m_state + 1; } + +std::unique_ptr fence::clone() const { + return std::make_unique(*this); +} + +uint64_t fence::wait_next_state() const { + std::lock_guard guard(m_lock); + + if (m_state != initial_state && m_signaled) + shim_err(-EINVAL, "Can't wait on fence that has been signaled before."); + return ++m_state; +} + +// Timeout value is ignored for now. 
+void fence::wait(uint32_t timeout_ms) const { + auto st = signal_next_state(); + shim_debug("Waiting for command fence %d@%ld", m_syncobj_hdl, st); + wait_syncobj_done(m_pdev, m_syncobj_hdl, st); +} + +void fence::submit_wait(const hw_ctx* ctx) const { + auto st = signal_next_state(); + shim_debug("Submitting wait for command fence %d@%ld", m_syncobj_hdl, st); + submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1); +} + +uint64_t fence::signal_next_state() const { + std::lock_guard guard(m_lock); + + if (m_state != initial_state && !m_signaled) + shim_err(-EINVAL, "Can't signal fence that has been waited before."); + if (m_state == initial_state) m_signaled = true; + return ++m_state; +} + +void fence::signal() const { + auto st = signal_next_state(); + shim_debug("Signaling command fence %d@%ld", m_syncobj_hdl, st); + signal_syncobj(m_pdev, m_syncobj_hdl, st); +} + +void fence::submit_signal(const hw_ctx* ctx) const { + auto st = signal_next_state(); + shim_debug("Submitting signal command fence %d@%ld", m_syncobj_hdl, st); + submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st); +} + +void fence::submit_wait(const pdev& dev, const hw_ctx* ctx, + const std::vector& fences) { + constexpr int max_fences = 1024; + uint32_t hdls[max_fences]; + uint64_t pts[max_fences]; + int i = 0; + + if (fences.size() > max_fences) + shim_err(-EINVAL, "Too many fences in one submit: %d", fences.size()); + + for (auto f : fences) { + auto fh = static_cast(f); + auto st = fh->wait_next_state(); + shim_debug("Waiting for command fence %d@%ld", fh->m_syncobj_hdl, st); + hdls[i] = fh->m_syncobj_hdl; + pts[i] = st; + i++; + } + submit_wait_syncobjs(dev, ctx, hdls, pts, i); +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h new file mode 100644 index 000000000..2b9067c1c --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h @@ -0,0 
+1,62 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef _FENCE_XDNA_H_ +#define _FENCE_XDNA_H_ + +#include + +#include "device.h" +#include "hwctx.h" +#include "shared.h" +#include "shim_debug.h" + +namespace shim_xdna { + +struct fence { + using export_handle = shared_handle::export_handle; + enum class access_mode : uint8_t { local, shared, process, hybrid }; + + fence(const device& device); + + fence(const device& device, shared_handle::export_handle ehdl); + + ~fence(); + + std::unique_ptr clone() const; + + std::unique_ptr share() const; + + void wait(uint32_t timeout_ms) const; + + uint64_t get_next_state() const; + + void signal() const; + + void submit_wait(const hw_ctx*) const; + + static void submit_wait(const pdev& dev, const hw_ctx*, + const std::vector& fences); + + void submit_signal(const hw_ctx*) const; + + uint64_t wait_next_state() const; + + uint64_t signal_next_state() const; + + const pdev& m_pdev; + const std::unique_ptr m_import; + uint32_t m_syncobj_hdl; + + // Protecting below mutables + mutable std::mutex m_lock; + // Set once at first signal + mutable bool m_signaled = false; + // Ever incrementing at each wait/signal + static constexpr uint64_t initial_state = 0; + mutable uint64_t m_state = initial_state; +}; + +} // namespace shim_xdna + +#endif // _FENCE_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp new file mode 100644 index 000000000..a10abf3ae --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+
+#include "hwctx.h"
+
+#include "bo.h"
+#include "hwq.h"
+
+namespace {
+
+// Returns the PDI blob of the first partition entry whose CDO groups list
+// kernel_id. Reports an error through shim_err (which does not return) when
+// no entry matches.
+std::vector get_pdi(const xrt_core::xclbin::aie_partition_obj& aie,
+                    uint16_t kernel_id) {
+  for (auto& pdi : aie.pdis) {
+    for (auto& cdo : pdi.cdo_groups) {
+      for (auto kid : cdo.kernel_ids) {
+        if (kid == kernel_id) return pdi.pdi;
+      }
+    }
+  }
+  shim_err(ENOENT, "PDI for kernel ID 0x%x not found", kernel_id);
+}
+
+}  // namespace
+namespace shim_xdna {
+
+// Stores the device reference, takes ownership of the queue and loads the QoS
+// settings. The device-side context is created separately by
+// create_ctx_on_device().
+// NOTE(review): `xclbin` is neither a parameter nor a member visible here, and
+// parse_xclbin is not declared in the hw_ctx struct in hwctx.h -- confirm.
+hw_ctx::hw_ctx(const device& dev, const qos_type& qos, std::unique_ptr q)
+    : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) {
+  shim_debug("Creating HW context...");
+  init_qos_info(qos);
+  parse_xclbin(xclbin);
+}
+
+// Destroys the device-side context (a no-op when none was created).
+hw_ctx::~hw_ctx() {
+  delete_ctx_on_device();
+  shim_debug("Destroyed HW context (%d)...", m_handle);
+}
+
+hw_ctx::slot_id hw_ctx::get_slotidx() const { return m_handle; }
+
+void hw_ctx::set_slotidx(slot_id id) { m_handle = id; }
+
+// Looks up cu_name in m_cu_info and returns its position as a cuidx_type.
+// Reports ENOENT through shim_err when the name is not present.
+cuidx_type hw_ctx::open_cu_context(const std::string& cu_name) {
+  for (uint32_t i = 0; i < m_cu_info.size(); i++) {
+    auto& ci = m_cu_info[i];
+    if (ci.m_name == cu_name) return cuidx_type{.index = i};
+  }
+
+  shim_err(ENOENT, "CU name (%s) not found", cu_name.c_str());
+}
+
+void hw_ctx::close_cu_context(cuidx_type cuidx) {
+  // Nothing to be done
+}
+
+// Convenience overload: allocates without a user pointer.
+std::unique_ptr hw_ctx::alloc_bo(size_t size, uint64_t flags) {
+  return alloc_bo(nullptr, size, flags);
+}
+
+// Forwards the import request to the owning device.
+std::unique_ptr hw_ctx::import_bo(pid_t pid,
+                                  shared_handle::export_handle ehdl) {
+  // const_cast: import_bo() is not const yet in device class
+  auto& dev = const_cast(get_device());
+  return dev.import_bo(pid, ehdl);
+}
+
+hw_q* hw_ctx::get_hw_queue() { return m_q.get(); }
+
+// Copies the recognised keys of `qos` into m_qos; unknown keys are ignored.
+void hw_ctx::init_qos_info(const qos_type& qos) {
+  for (auto& [key, value] : qos) {
+    if (key == "gops")
+      m_qos.gops = value;
+    else if (key == "fps")
+      m_qos.fps = value;
+    else if (key == "dma_bandwidth")
+      m_qos.dma_bandwidth = value;
+    else if (key == "latency")
+      m_qos.latency = value;
+    else if (key == "frame_execution_time")
+      m_qos.frame_exec_time = value;
+    else if (key == "priority")
+      m_qos.priority = value;
+  }
+}
+
+// Logs every entry of m_cu_info followed by m_ops_per_cycle.
+// NOTE(review): print_xclbin_info is not declared in the hw_ctx struct in
+// hwctx.h -- confirm.
+void hw_ctx::print_xclbin_info() {
+  if (m_cu_info.empty()) {
+    shim_debug("CU INFO is empty");
+    return;
+  }
+
+  for (int idx = 0; idx < m_cu_info.size(); idx++) {
+    auto& e = m_cu_info[idx];
+    shim_debug("index=%d, name=%s, func=%d, pdi(p=%p, sz=%ld)", idx,
+               e.m_name.c_str(), e.m_func, e.m_pdi.data(), e.m_pdi.size());
+  }
+  shim_debug("OPs/cycle: %d", m_ops_per_cycle);
+}
+
+const device& hw_ctx::get_device() { return m_device; }
+
+const std::vector& hw_ctx::get_cu_info() const {
+  return m_cu_info;
+}
+
+// Issues DRM_IOCTL_AMDXDNA_CREATE_HWCTX, records the returned handle and
+// doorbell, then binds the queue to this context.
+void hw_ctx::create_ctx_on_device() {
+  amdxdna_drm_create_hwctx arg = {};
+  arg.qos_p = reinterpret_cast(&m_qos);
+  arg.umq_bo = m_q->get_queue_bo();
+  arg.max_opc = m_ops_per_cycle;
+  arg.num_tiles =
+      m_num_cols *
+      xrt_core::device_query(&m_device)
+          .core_rows;
+  // The log buffer is optional; pass the invalid handle when there is none.
+  arg.log_buf_bo = m_log_bo
+                       ? static_cast(m_log_bo.get())->get_drm_bo_handle()
+                       : AMDXDNA_INVALID_BO_HANDLE;
+  m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &arg);
+
+  set_slotidx(arg.handle);
+  set_doorbell(arg.umq_doorbell);
+
+  m_q->bind_hwctx(this);
+}
+
+// Unbinds the queue, issues DRM_IOCTL_AMDXDNA_DESTROY_HWCTX and unmaps the log
+// buffer. Returns early when no device-side context exists.
+void hw_ctx::delete_ctx_on_device() {
+  if (m_handle == AMDXDNA_INVALID_CTX_HANDLE) return;
+
+  m_q->unbind_hwctx();
+  struct amdxdna_drm_destroy_hwctx arg = {};
+  arg.handle = m_handle;
+  m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &arg);
+
+  fini_log_buf();
+}
+
+// Allocates a log buffer of m_num_cols * 1024 bytes, maps it and zeroes it.
+void hw_ctx::init_log_buf() {
+  auto log_buf_size = m_num_cols * 1024;
+  m_log_bo = alloc_bo(nullptr, log_buf_size, XCL_BO_FLAGS_EXECBUF);
+  m_log_buf = m_log_bo->map(bo::map_type::write);
+  std::memset(m_log_buf, 0, log_buf_size);
+}
+
+// Unmaps the log buffer if one was allocated.
+void hw_ctx::fini_log_buf(void) {
+  if (m_log_bo) m_log_bo->unmap(m_log_buf);
+}
+
+void hw_ctx::set_doorbell(uint32_t db) { m_doorbell = db; }
+
+uint32_t hw_ctx::get_doorbell() const { return m_doorbell; }
+
+}  // namespace shim_xdna
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h
b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h new file mode 100644 index 000000000..446d64f58 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef _HWCTX_XDNA_H_ +#define _HWCTX_XDNA_H_ + +#include + +#include "amdxdna_accel.h" +#include "device.h" +#include "shim_debug.h" + +namespace shim_xdna { + +struct hw_q; // forward declaration + +struct hw_ctx { + using qos_type = std::map; + enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; + using access_mode = access_mode; + using slot_id = uint32_t; + + hw_ctx(const device& dev, const qos_type& qos, std::unique_ptr q); + + ~hw_ctx(); + + // TODO + void update_qos(const qos_type&) { shim_not_supported_err(__func__); } + + void update_access_mode(access_mode) { shim_not_supported_err(__func__); } + + slot_id get_slotidx() const; + + hw_q* get_hw_queue(); + + std::unique_ptr alloc_bo(void* userptr, size_t size, uint64_t flags); + + std::unique_ptr alloc_bo(size_t size, uint64_t flags); + + std::unique_ptr import_bo(pid_t, shared_handle::export_handle); + + cuidx_type open_cu_context(const std::string& cuname); + + void close_cu_context(cuidx_type cuidx); + + void exec_buf(bo*) { shim_not_supported_err(__func__); } + + uint32_t get_doorbell() const; + + const device& get_device(); + + struct cu_info { + std::string m_name; + size_t m_func; + std::vector m_pdi; + }; + + const std::vector& get_cu_info() const; + + void set_slotidx(slot_id id); + + void set_doorbell(uint32_t db); + + void create_ctx_on_device(); + + void init_log_buf(); + + void fini_log_buf(); + + const device& m_device; + slot_id m_handle = AMDXDNA_INVALID_CTX_HANDLE; + amdxdna_qos_info m_qos = {}; + std::vector m_cu_info; + std::unique_ptr m_q; + uint32_t m_ops_per_cycle; + uint32_t m_num_cols; + uint32_t m_doorbell; + std::unique_ptr m_log_bo; + 
void* m_log_buf; + + void delete_ctx_on_device(); + + void init_qos_info(const qos_type& qos); +}; + +} // namespace shim_xdna + +#endif // _HWCTX_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp new file mode 100644 index 000000000..d52c2a8ac --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "hwq.h" + +#include "bo.h" +#include "ert.h" +#include "fence.h" +#include "shim_debug.h" + +namespace { + +ert_packet *get_chained_command_pkt(shim_xdna::bo *boh) { + auto cmdpkt = + reinterpret_cast(boh->map(shim_xdna::bo::map_type::write)); + return cmdpkt->opcode == ERT_CMD_CHAIN ? cmdpkt : nullptr; +} + +int wait_cmd(const shim_xdna::pdev &pdev, const shim_xdna::hw_ctx *ctx, + shim_xdna::bo *cmd, uint32_t timeout_ms) { + int ret = 1; + auto boh = static_cast(cmd); + auto id = boh->get_cmd_id(); + + shim_debug("Waiting for cmd (%ld)...", id); + + amdxdna_drm_wait_cmd wcmd = { + .hwctx = ctx->get_slotidx(), + .timeout = timeout_ms, + .seq = boh->get_cmd_id(), + }; + + pdev.ioctl(DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd); + return ret; +} + +} // namespace + +namespace shim_xdna { + +hw_q::hw_q(const device &device) + : m_hwctx(nullptr), + m_queue_boh(AMDXDNA_INVALID_BO_HANDLE), + m_pdev(device.get_pdev()) {} + +void hw_q::bind_hwctx(const hw_ctx *ctx) { + m_hwctx = ctx; + shim_debug("Bond HW queue to HW context %d", m_hwctx->get_slotidx()); +} + +void hw_q::unbind_hwctx() { + shim_debug("Unbond HW queue from HW context %d", m_hwctx->get_slotidx()); + m_hwctx = nullptr; +} + +uint32_t hw_q::get_queue_bo() { return m_queue_boh; } + +void hw_q::submit_command(bo *cmd) { issue_command(cmd); } + +int hw_q::poll_command(bo *cmd) const { + auto cmdpkt = reinterpret_cast(cmd->map(bo::map_type::write)); + + if 
(cmdpkt->state >= ERT_CMD_STATE_COMPLETED) { + return 1; + } + return 0; +} + +int hw_q::wait_command(bo *cmd, uint32_t timeout_ms) const { + if (poll_command(cmd)) return 1; + return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); +} + +void hw_q::submit_wait(const fence *f) { + auto fh = static_cast(f); + fh->submit_wait(m_hwctx); +} + +void hw_q::submit_wait(const std::vector &fences) { + fence::submit_wait(m_pdev, m_hwctx, fences); +} + +void hw_q::submit_signal(const fence *f) { + auto fh = static_cast(f); + fh->submit_signal(m_hwctx); +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h new file mode 100644 index 000000000..30b3cfbf9 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef _HWQ_XDNA_H_ +#define _HWQ_XDNA_H_ + +#include "fence.h" +#include "hwctx.h" +#include "shim_debug.h" + +namespace shim_xdna { + +struct hw_q { + hw_q(const device &device); + + void submit_command(bo *); + + int poll_command(bo *) const; + + int wait_command(bo *, uint32_t timeout_ms) const; + + void submit_wait(const fence *); + + void submit_wait(const std::vector &); + + void submit_signal(const fence *); + + virtual void bind_hwctx(const hw_ctx *ctx) = 0; + + void unbind_hwctx(); + + uint32_t get_queue_bo(); + + virtual void issue_command(bo *) = 0; + + const hw_ctx *m_hwctx; + const pdev &m_pdev; + uint32_t m_queue_boh; +}; + +} // namespace shim_xdna + +#endif // _HWQ_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp new file mode 100644 index 000000000..e057c61ac --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp @@ -0,0 +1,444 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pcidev.h" + +#include +#include +#include +#include + +#include +#include + +#include "amdxdna_accel.h" +#include "bo.h" +#include "pcidrv.h" +#include "shim_debug.h" + +namespace { + +std::string ioctl_cmd2name(unsigned long cmd) { + switch (cmd) { + case DRM_IOCTL_AMDXDNA_CREATE_HWCTX: + return "DRM_IOCTL_AMDXDNA_CREATE_HWCTX"; + case DRM_IOCTL_AMDXDNA_DESTROY_HWCTX: + return "DRM_IOCTL_AMDXDNA_DESTROY_HWCTX"; + case DRM_IOCTL_AMDXDNA_CONFIG_HWCTX: + return "DRM_IOCTL_AMDXDNA_CONFIG_HWCTX"; + case DRM_IOCTL_AMDXDNA_CREATE_BO: + return "DRM_IOCTL_AMDXDNA_CREATE_BO"; + case DRM_IOCTL_AMDXDNA_GET_BO_INFO: + return "DRM_IOCTL_AMDXDNA_GET_BO_INFO"; + case DRM_IOCTL_AMDXDNA_SYNC_BO: + return "DRM_IOCTL_AMDXDNA_SYNC_BO"; + case DRM_IOCTL_AMDXDNA_EXEC_CMD: + return "DRM_IOCTL_AMDXDNA_EXEC_CMD"; + case DRM_IOCTL_AMDXDNA_WAIT_CMD: + return "DRM_IOCTL_AMDXDNA_WAIT_CMD"; + case DRM_IOCTL_AMDXDNA_GET_INFO: + return "DRM_IOCTL_AMDXDNA_GET_INFO"; + case DRM_IOCTL_AMDXDNA_SET_STATE: + return "DRM_IOCTL_AMDXDNA_SET_STATE"; + case DRM_IOCTL_GEM_CLOSE: + return "DRM_IOCTL_GEM_CLOSE"; + case DRM_IOCTL_PRIME_HANDLE_TO_FD: + return "DRM_IOCTL_PRIME_HANDLE_TO_FD"; + case DRM_IOCTL_PRIME_FD_TO_HANDLE: + return "DRM_IOCTL_PRIME_FD_TO_HANDLE"; + case DRM_IOCTL_SYNCOBJ_CREATE: + return "DRM_IOCTL_SYNCOBJ_CREATE"; + case DRM_IOCTL_SYNCOBJ_QUERY: + return "DRM_IOCTL_SYNCOBJ_QUERY"; + case DRM_IOCTL_SYNCOBJ_DESTROY: + return "DRM_IOCTL_SYNCOBJ_DESTROY"; + case DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD: + return "DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD"; + case DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE: + return "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE"; + case DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL: + return "DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL"; + case DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT: + return "DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT"; + } + + return "UNKNOWN(" + std::to_string(cmd) + ")"; +} + +size_t bar_size(const 
std::string& dir, unsigned bar) { + std::ifstream ifs(dir + "/resource"); + if (!ifs.good()) return 0; + std::string line; + for (unsigned i = 0; i <= bar; i++) { + line.clear(); + std::getline(ifs, line); + } + long long start, end, meta; + if (sscanf(line.c_str(), "0x%llx 0x%llx 0x%llx", &start, &end, &meta) != 3) + return 0; + return end - start + 1; +} + +int get_render_value(const std::string& dir, + const std::string& devnode_prefix) { + struct dirent* entry; + DIR* dp; + int instance_num = INVALID_ID; + + dp = opendir(dir.c_str()); + if (dp == nullptr) return instance_num; + + while ((entry = readdir(dp))) { + std::string dirname{entry->d_name}; + if (dirname.compare(0, devnode_prefix.size(), devnode_prefix) == 0) { + instance_num = std::stoi(dirname.substr(devnode_prefix.size())); + break; + } + } + + closedir(dp); + + return instance_num; +} + +bool is_admin() { return (getuid() == 0) || (geteuid() == 0); } + +const size_t dev_mem_size = (64 << 20); + +} // namespace + +namespace sysfs { + +static constexpr const char* dev_root = "/sys/bus/pci/devices/"; + +static std::string get_name(const std::string& dir, const std::string& subdir) { + std::string line; + std::ifstream ifs(dir + "/" + subdir + "/name"); + + if (ifs.is_open()) std::getline(ifs, line); + + return line; +} + +// Helper to find subdevice directory name +// Assumption: all subdevice's sysfs directory name starts with subdevice name!! 
+static int get_subdev_dir_name(const std::string& dir, + const std::string& subDevName, + std::string& subdir) { + DIR* dp; + size_t sub_nm_sz = subDevName.size(); + + subdir = ""; + if (subDevName.empty()) return 0; + + int ret = -ENOENT; + dp = opendir(dir.c_str()); + if (dp) { + struct dirent* entry; + while ((entry = readdir(dp))) { + std::string nm = get_name(dir, entry->d_name); + if (!nm.empty()) { + if (nm != subDevName) continue; + } else if (strncmp(entry->d_name, subDevName.c_str(), sub_nm_sz) != 0 || + entry->d_name[sub_nm_sz] != '.') { + continue; + } + // found it + subdir = entry->d_name; + ret = 0; + break; + } + closedir(dp); + } + + return ret; +} + +static std::string get_path(const std::string& name, const std::string& subdev, + const std::string& entry) { + std::string subdir; + if (get_subdev_dir_name(dev_root + name, subdev, subdir) != 0) return ""; + + std::string path = dev_root; + path += name; + path += "/"; + path += subdir; + path += "/"; + path += entry; + return path; +} + +static std::fstream open_path(const std::string& path, std::string& err, + bool write, bool binary) { + std::fstream fs; + std::ios::openmode mode = write ? std::ios::out : std::ios::in; + + if (binary) mode |= std::ios::binary; + + err.clear(); + fs.open(path, mode); + if (!fs.is_open()) { + std::stringstream ss; + ss << "Failed to open " << path << " for " << (binary ? "binary " : "") + << (write ? 
"writing" : "reading") << ": " << strerror(errno) + << std::endl; + err = ss.str(); + } + return fs; +} + +static std::fstream open(const std::string& name, const std::string& subdev, + const std::string& entry, std::string& err, bool write, + bool binary) { + std::fstream fs; + auto path = get_path(name, subdev, entry); + + if (path.empty()) { + std::stringstream ss; + ss << "Failed to find subdirectory for " << subdev << " under " + << dev_root + name << std::endl; + err = ss.str(); + } else { + fs = open_path(path, err, write, binary); + } + + return fs; +} + +static void get(const std::string& name, const std::string& subdev, + const std::string& entry, std::string& err, + std::vector& sv) { + std::fstream fs = open(name, subdev, entry, err, false, false); + if (!err.empty()) return; + + sv.clear(); + std::string line; + while (std::getline(fs, line)) sv.push_back(line); +} + +static void get(const std::string& name, const std::string& subdev, + const std::string& entry, std::string& err, + std::vector& iv) { + iv.clear(); + + std::vector sv; + get(name, subdev, entry, err, sv); + if (!err.empty()) return; + + for (auto& s : sv) { + if (s.empty()) { + std::stringstream ss; + ss << "Reading " << get_path(name, subdev, entry) << ", "; + ss << "can't convert empty string to integer" << std::endl; + err = ss.str(); + break; + } + char* end = nullptr; + auto n = std::strtoull(s.c_str(), &end, 0); + if (*end != '\0') { + std::stringstream ss; + ss << "Reading " << get_path(name, subdev, entry) << ", "; + ss << "failed to convert string to integer: " << s << std::endl; + err = ss.str(); + break; + } + iv.push_back(n); + } +} + +static void get(const std::string& name, const std::string& subdev, + const std::string& entry, std::string& err, std::string& s) { + std::vector sv; + get(name, subdev, entry, err, sv); + if (!sv.empty()) + s = sv[0]; + else + s = ""; // default value +} + +static void get(const std::string& name, const std::string& subdev, + const 
std::string& entry, std::string& err, + std::vector& buf) { + std::fstream fs = open(name, subdev, entry, err, false, true); + if (!err.empty()) return; + + buf.clear(); + buf.insert(std::end(buf), std::istreambuf_iterator(fs), + std::istreambuf_iterator()); +} + +static void put(const std::string& name, const std::string& subdev, + const std::string& entry, std::string& err, + const std::string& input) { + std::fstream fs = open(name, subdev, entry, err, true, false); + if (!err.empty()) return; + fs << input; + fs.close(); // flush and close, if either fails then stream failbit is set. + if (!fs.good()) { + std::stringstream ss; + ss << "Failed to write " << get_path(name, subdev, entry) << ": " + << strerror(errno) << std::endl; + err = ss.str(); + } +} + +static void put(const std::string& name, const std::string& subdev, + const std::string& entry, std::string& err, + const std::vector& buf) { + std::fstream fs = open(name, subdev, entry, err, true, true); + if (!err.empty()) return; + + fs.write(buf.data(), buf.size()); + fs.close(); // flush and close, if either fails then stream failbit is set. + if (!fs.good()) { + std::stringstream ss; + ss << "Failed to write " << get_path(name, subdev, entry) << ": " + << strerror(errno) << std::endl; + err = ss.str(); + } +} + +static void put(const std::string& name, const std::string& subdev, + const std::string& entry, std::string& err, + const unsigned int& input) { + std::fstream fs = open(name, subdev, entry, err, true, false); + if (!err.empty()) return; + fs << input; + fs.close(); // flush and close, if either fails then stream failbit is set. 
+ if (!fs.good()) { + std::stringstream ss; + ss << "Failed to write " << get_path(name, subdev, entry) << ": " + << strerror(errno) << std::endl; + err = ss.str(); + } +} + +} // namespace sysfs + +namespace shim_xdna { + +void pdev::sysfs_get(const std::string& subdev, const std::string& entry, + std::string& err, std::vector& ret) const { + sysfs::get(m_sysfs_name, subdev, entry, err, ret); +} + +pdev::pdev(std::shared_ptr driver, std::string sysfs_name) + : m_driver(std::move(driver)), m_sysfs_name(std::move(sysfs_name)) { + std::string err; + + if (sscanf(m_sysfs_name.c_str(), "%hx:%hx:%hx.%hx", &m_domain, &m_bus, &m_dev, + &m_func) < 4) + llvm::report_fatal_error(llvm::Twine(m_sysfs_name) + " is not valid BDF"); + + m_is_mgmt = !m_driver->is_user(); + + if (m_is_mgmt) { + sysfs_get("", "instance", err, m_instance, + static_cast(INVALID_ID)); + } else { + m_instance = get_render_value( + sysfs::dev_root + m_sysfs_name + "/" + m_driver->sysfs_dev_node_dir(), + m_driver->dev_node_prefix()); + } + + sysfs_get("", "userbar", err, m_user_bar, 0); + m_user_bar_size = bar_size(sysfs::dev_root + m_sysfs_name, m_user_bar); + sysfs_get("", "ready", err, m_is_ready, false); + m_user_bar_map = reinterpret_cast(MAP_FAILED); + m_is_ready = true; // We're always ready. +} + +pdev::~pdev() { + if (m_dev_fd != -1) shim_debug("Device node fd leaked!! 
fd=%d", m_dev_fd); +} + +std::string pdev::get_subdev_path(const std::string& subdev, uint idx) const { + // Main devfs path + if (subdev.empty()) { + std::string instStr = std::to_string(m_instance); + std::string prefixStr = "/dev/"; + prefixStr += m_driver->dev_node_dir() + "/" + m_driver->dev_node_prefix(); + return prefixStr + instStr; + } + + llvm::report_fatal_error("subdev path not supported"); +} + +int pdev::open(const std::string& subdev, uint32_t idx, int flag) const { + if (m_is_mgmt && !::is_admin()) + llvm::report_fatal_error("Root privileges required"); + + std::string devfs = get_subdev_path(subdev, idx); + return ::open(devfs.c_str(), flag); +} + +int pdev::open(const std::string& subdev, int flag) const { + return open(subdev, 0, flag); +} + +void pdev::open() const { + int fd; + const std::lock_guard lock(m_lock); + + if (m_dev_users == 0) { + fd = pdev::open("", O_RDWR); + if (fd < 0) + shim_err(EINVAL, "Failed to open KMQ device"); + else + shim_debug("Device opened, fd=%d", fd); + // Publish the fd for other threads to use. + m_dev_fd = fd; + } + ++m_dev_users; +} + +void pdev::close() const { + int fd; + const std::lock_guard lock(m_lock); + + --m_dev_users; + if (m_dev_users == 0) { + on_last_close(); + + // Stop new users of the fd from other threads. + fd = m_dev_fd; + m_dev_fd = -1; + // Kernel will wait for existing users to quit. 
+ ::close(fd); + shim_debug("Device closed, fd=%d", fd); + } +} + +void pdev::ioctl(unsigned long cmd, void* arg) const { + if (::ioctl(m_dev_fd, cmd, arg) == -1) + shim_err(errno, "%s IOCTL failed", ioctl_cmd2name(cmd).c_str()); +} + +void* pdev::mmap(void* addr, size_t len, int prot, int flags, + off_t offset) const { + void* ret = ::mmap(addr, len, prot, flags, m_dev_fd, offset); + + if (ret == reinterpret_cast(-1)) + shim_err(errno, + "mmap(addr=%p, len=%ld, prot=%d, flags=%d, offset=%ld) failed", + addr, len, prot, flags, offset); + return ret; +} + +void pdev::munmap(void* addr, size_t len) const { ::munmap(addr, len); } + +std::shared_ptr pdev::create_device(device::handle_type handle, + device::id_type id) const { + auto dev = std::make_shared(*this, handle, id); + // Alloc device memory on first device creation. + // No locking is needed since driver will ensure only one heap BO is + // created. + if (m_dev_heap_bo == nullptr) + m_dev_heap_bo = + std::make_unique(*dev, dev_mem_size, AMDXDNA_BO_DEV_HEAP); + return dev; +} + +void pdev::on_last_close() const { m_dev_heap_bo.reset(); } + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h new file mode 100644 index 000000000..a2cb858fe --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+
+#ifndef PCIDEV_XDNA_H
+#define PCIDEV_XDNA_H
+
+#include
+
+#include
+
+#include "bo.h"
+#include "device.h"
+// NOTE(review): this header includes itself; the include guard makes it a
+// no-op.
+#include "pcidev.h"
+#include "shim_debug.h"
+
+namespace shim_xdna {
+
+#define INVALID_ID 0xffff
+
+struct drv;
+
+// One PCI device found under /sys/bus/pci/devices. Wraps the device node file
+// descriptor (opened on first open(), closed on last close()) and the
+// ioctl/mmap calls made on it.
+struct pdev {
+  pdev(std::shared_ptr driver, std::string sysfs_name);
+  ~pdev();
+
+  // Reads a sysfs entry of this device into `iv`; `err` is set on failure.
+  void sysfs_get(const std::string& subdev, const std::string& entry,
+                 std::string& err, std::vector& iv) const;
+
+  // Reads a sysfs entry and stores its first value in `i`, or default_val
+  // when nothing was read.
+  template
+  void sysfs_get(const std::string& subdev, const std::string& entry,
+                 std::string& err, T& i, const T& default_val) {
+    std::vector iv;
+    sysfs_get(subdev, entry, err, iv);
+    if (!iv.empty())
+      i = static_cast(iv[0]);
+    else
+      i = static_cast(default_val);  // default value
+  }
+
+  std::string get_subdev_path(const std::string& subdev, uint32_t idx) const;
+
+  std::shared_ptr create_device(device::handle_type handle,
+                                device::id_type id) const;
+
+  // Thin wrappers over the system calls, applied to m_dev_fd.
+  void ioctl(unsigned long cmd, void* arg) const;
+
+  void* mmap(void* addr, size_t len, int prot, int flags, off_t offset) const;
+
+  void munmap(void* addr, size_t len) const;
+
+  // Open a device node and return its file descriptor.
+  int open(const std::string& subdev, uint32_t idx, int flag) const;
+  int open(const std::string& subdev, int flag) const;
+
+  // Reference-counted open/close of the main device node (m_dev_users).
+  void open() const;
+
+  void close() const;
+
+  void on_last_close() const;
+  int map_usr_bar() const;
+
+  // Virtual address of memory mapped BAR0, mapped on first use, once mapped,
+  // never change.
+  mutable char* m_user_bar_map = reinterpret_cast(MAP_FAILED);
+
+  std::shared_ptr m_driver;
+  mutable int m_dev_fd = -1;
+  mutable int m_dev_users = 0;
+  mutable std::mutex m_lock;
+  uint16_t m_domain = INVALID_ID;
+  uint16_t m_bus = INVALID_ID;
+  uint16_t m_dev = INVALID_ID;
+  uint16_t m_func = INVALID_ID;
+  uint32_t m_instance = INVALID_ID;
+  std::string m_sysfs_name;  // dir name under /sys/bus/pci/devices
+  int m_user_bar = 0;        // BAR mapped in by tools, default is BAR0
+  size_t m_user_bar_size = 0;
+  bool m_is_mgmt = false;
+  bool m_is_ready = false;
+
+  // Create on first device creation and removed right before device is closed
+  mutable std::unique_ptr m_dev_heap_bo;
+};
+
+}  // namespace shim_xdna
+
+#endif
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp
new file mode 100644
index 000000000..5841e1bb3
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "pcidrv.h"
+
+#include
+
+#include "amdxdna_accel.h"
+#include "pcidev.h"
+
+namespace {
+
+// Reads /sys/bus/pci/devices/<sysfs>/device_type and converts its first line
+// to an amdxdna_device_type. Aborts through llvm::report_fatal_error when the
+// file cannot be opened.
+// NOTE(review): std::stoi throws on an empty or non-numeric line; nothing
+// here handles that.
+amdxdna_device_type get_dev_type(const std::string& sysfs) {
+  const std::string sysfs_root{"/sys/bus/pci/devices/"};
+  const std::string dev_type_path = sysfs_root + sysfs + "/device_type";
+
+  std::ifstream ifs(dev_type_path);
+  if (!ifs.is_open())
+    llvm::report_fatal_error(llvm::Twine(dev_type_path) + " is missing?");
+
+  std::string line;
+  std::getline(ifs, line);
+  return static_cast(std::stoi(line));
+}
+
+}  // namespace
+
+namespace shim_xdna {
+
+// Fixed driver identity strings used to build sysfs and /dev paths.
+std::string drv::name() const { return "amdxdna"; }
+
+std::string drv::dev_node_prefix() const { return "accel"; }
+
+std::string drv::dev_node_dir() const { return "accel"; }
+
+std::string drv::sysfs_dev_node_dir() const { return "accel"; }
+
+bool drv::is_user() const { return true; }
+
+// Creates the pdev for the given sysfs entry. Only AMDXDNA_DEV_TYPE_KMQ is
+// handled; any other type reports an error through shim_err.
+std::shared_ptr drv::create_pcidev(const std::string& sysfs) const {
+  auto t = get_dev_type(sysfs);
+  auto driver = std::static_pointer_cast(shared_from_this());
+  if (t == AMDXDNA_DEV_TYPE_KMQ) return std::make_shared(driver, sysfs);
+  // if (t == AMDXDNA_DEV_TYPE_UMQ)
+  // return std::make_shared(driver, sysfs);
+  shim_err(-EINVAL, "Unknown device type: %d", t);
+}
+
+}  // namespace shim_xdna
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h
new file mode 100644
index 000000000..95cf8757e
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#ifndef _PCIDRV_XDNA_H_
+#define _PCIDRV_XDNA_H_
+
+#include
+
+#include "pcidev.h"
+
+namespace shim_xdna {
+
+// Driver description: supplies the name strings used to locate device nodes
+// and creates the pdev object for a sysfs entry.
+struct drv : std::enable_shared_from_this {
+  std::string name() const;
+  bool is_user() const;
+  std::string dev_node_prefix() const;
+  std::string dev_node_dir() const;
+  std::string sysfs_dev_node_dir() const;
+  std::shared_ptr create_pcidev(const std::string& sysfs) const;
+};
+
+}  // namespace shim_xdna
+
+#endif
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h
new file mode 100644
index 000000000..ff026880d
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#ifndef _SHARED_XDNA_H_
+#define _SHARED_XDNA_H_
+
+#include
+
+namespace shim_xdna {
+
+// Owns a file descriptor and closes it on destruction (an fd of -1 is treated
+// as "nothing to close").
+struct shared_handle {
+  shared_handle(int fd) : m_fd(fd) {}
+  ~shared_handle() {
+    if (m_fd != -1) close(m_fd);
+  }
+  // Not copyable: a copy would close the same descriptor a second time.
+  shared_handle(const shared_handle&) = delete;
+  shared_handle& operator=(const shared_handle&) = delete;
+
+  using export_handle = int;
+  // Returns the owned descriptor; ownership stays with this object.
+  export_handle get_export_handle() const { return m_fd; }
+
+  const int m_fd;
+};
+
+}  // namespace shim_xdna
+
+#endif  // _SHARED_XDNA_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp
new file mode 100644
index 000000000..f57848458
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp
@@ -0,0 +1,35 @@
+//
+// Created by mlevental on 10/2/24.
+//
+
+#include
+#include
+#include
+
+// Serialises debug output across threads.
+static std::recursive_mutex s_debug_mutex;
+
+namespace {
+// Holds s_debug_mutex for the lifetime of the object.
+struct debug_lock {
+  std::lock_guard m_lk;
+  debug_lock();
+};
+
+debug_lock::debug_lock() : m_lk(s_debug_mutex) {}
+
+// Time elapsed since the first call to this function.
+unsigned long time_ns() {
+  static auto zero = std::chrono::high_resolution_clock::now();
+  auto now = std::chrono::high_resolution_clock::now();
+  auto integral_duration =
+      std::chrono::duration_cast(now - zero).count();
+  return static_cast(integral_duration);
+}
+
+// printf-style logger: prints a timestamp prefix, then the formatted message,
+// while holding the debug lock.
+void debugf(const char* format, ...) {
+  debug_lock lk;
+  va_list args;
+  va_start(args, format);
+  printf("%lu: ", time_ns());
+  vprintf(format, args);
+  va_end(args);
+}
+}  // namespace
\ No newline at end of file
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h
new file mode 100644
index 000000000..fbc88dc09
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef SHIM_DEBUG_H
+#define SHIM_DEBUG_H
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include "llvm/Support/Error.h"
+
+namespace {
+
+// NOTE(review): debugf is declared here in an unnamed namespace but defined
+// in shim_debug.cpp's own unnamed namespace, so other translation units that
+// include this header get a declaration with no definition -- confirm.
+void debugf(const char* format, ...);
+
+#define XRT_PRINTF(format, ...) debugf(format, ##__VA_ARGS__)  // NOLINT
+
+// Formats `fmt` with `args` printf-style, appends " (err=<err>)" and aborts
+// through llvm::report_fatal_error. Never returns.
+template
+[[noreturn]] void shim_err(int err, const char* fmt, Args&&... args) {
+  std::string format = std::string(fmt);
+  format += " (err=%d)";
+  // First pass with a null buffer only measures the formatted length; err is
+  // passed last to fill the appended "%d".
+  int sz = std::snprintf(nullptr, 0, format.c_str(), args..., err) + 1;
+  if (sz <= 0) llvm::report_fatal_error("could not format error string");
+
+  auto size = static_cast(sz);
+  std::unique_ptr buf(new char[size]);
+  std::snprintf(buf.get(), size, format.c_str(), args..., err);
+  llvm::report_fatal_error(buf.get());
+}
+
+// Aborts with `msg` as the error text (error code 0).
+[[noreturn]] inline void shim_not_supported_err(const char* msg) {
+  shim_err(0, msg);
+}
+
+// Logs a printf-style message prefixed with the process id.
+template
+void shim_debug(const char* fmt, Args&&... args) {
+  std::string format = "PID(%d): ";
+  format += std::string(fmt);
+  format += "\n";
+  XRT_PRINTF(format.c_str(), getpid(), std::forward(args)...);
+}
+
+// Same output path and format as shim_debug.
+template
+void shim_info(const char* fmt, Args&&... args) {
+  std::string format = "PID(%d): ";
+  format += std::string(fmt);
+  format += "\n";
+  XRT_PRINTF(format.c_str(), getpid(), std::forward(args)...);
+}
+
+}  // namespace
+
+#endif  // SHIM_DEBUG_H
From be04318c88faf865b6760bb006d17ab91e134877 Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Wed, 2 Oct 2024 20:16:53 -0400
Subject: [PATCH 03/35] compiles
---
 .github/workflows/ci-linux.yml | 10 +-
 .github/workflows/ci-windows.yml | 10 +-
 build_tools/build_test_cpp.ps1 | 10 +-
 build_tools/build_test_cpp.sh | 111 ++++++++++--------
 build_tools/download_peano.ps1 | 5 +-
 .../xrt-lite/shim/linux/kmq/amdxdna_accel.h | 10 +-
 .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 17 +--
 .../driver/xrt-lite/shim/linux/kmq/bo.h | 15 ++-
 .../driver/xrt-lite/shim/linux/kmq/device.cpp | 19 ++-
 .../driver/xrt-lite/shim/linux/kmq/device.h | 31 ++---
 .../driver/xrt-lite/shim/linux/kmq/ert.h | 2 +-
 .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 16 +--
 .../driver/xrt-lite/shim/linux/kmq/fence.h | 8 +-
 .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 44 ++-----
 .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 13 +-
 .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 4 +-
 .../driver/xrt-lite/shim/linux/kmq/hwq.h | 13 +-
 .../driver/xrt-lite/shim/linux/kmq/pcidev.cpp | 5 +-
 .../driver/xrt-lite/shim/linux/kmq/pcidev.h | 3 +-
 .../xrt-lite/shim/linux/kmq/shim_debug.cpp | 4 +-
.../xrt-lite/shim/linux/kmq/shim_debug.h | 5 +- 21 files changed, 184 insertions(+), 171 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 9acdcf2ec..cc647669b 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -49,7 +49,10 @@ jobs: git remote add origin $REPO_ADDRESS git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME git reset --hard FETCH_HEAD - git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10 + git -c submodule."third_party/torch-mlir".update=none \ + -c submodule."third_party/stablehlo".update=none \ + -c submodule."src/runtime_src/core/common/aiebu".update=none \ + submodule update --init --recursive --depth 1 --single-branch -j 10 - name: Install deps run: | @@ -62,6 +65,11 @@ jobs: pip install -r third_party/iree/runtime/bindings/python/iree/runtime/build_requirements.txt pip install pyyaml + - name: Peano dep + run: | + bash build_tools/download_peano.sh + echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV + - name: Enable cache uses: actions/cache/restore@v3 with: diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 48777f7f5..2d7a6e86e 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -60,7 +60,10 @@ jobs: git remote add origin $REPO_ADDRESS git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME git reset --hard FETCH_HEAD - git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10 + git -c submodule."third_party/torch-mlir".update=none \ + -c submodule."third_party/stablehlo".update=none \ + -c 
submodule."src/runtime_src/core/common/aiebu".update=none \ + submodule update --init --recursive --depth 1 --single-branch -j 10 - name: Setup Cpp uses: aminya/setup-cpp@v1 @@ -87,6 +90,11 @@ jobs: key: ${{ env.CACHE_KEY }} restore-keys: windows-build-test-cpp- + - name: Peano dep + run: | + .\build_tools\download_peano.ps1 + Add-Content -Path $env:GITHUB_ENV -Value "PEANO_INSTALL_DIR=$PWD\llvm-aie" + - name: Build packages run: | $env:cache_dir = "${{ env.CACHE_DIR }}" diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1 index f508ce5ff..9d214a5db 100644 --- a/build_tools/build_test_cpp.ps1 +++ b/build_tools/build_test_cpp.ps1 @@ -82,8 +82,16 @@ $CMAKE_ARGS = @( "-DIREE_CMAKE_PLUGIN_PATHS=$repo_root" "-DIREE_EXTERNAL_HAL_DRIVERS=xrt" "-DIREE_BUILD_PYTHON_BINDINGS=ON" + # iree/runtime/src/iree/hal/cts/cts_test_base.h:173:24: error: unused variable 'device_buffer' [-Werror,-Wunused-variable] + "-DIREE_ENABLE_WERROR_FLAG=OFF" ) +$peano_install_dir = "$env:PEANO_INSTALL_DIR" +if ($peano_install_dir -and (Test-Path "$peano_install_dir")) +{ + $CMAKE_ARGS += @("-DPEANO_INSTALL_DIR=$peano_install_dir") +} + if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) { echo "using existing llvm install @ $llvm_install_dir" @@ -116,7 +124,7 @@ echo "-----" # better have git-bash installed... 
$env:Path = "C:\Program Files\Git\bin;$env:Path" pushd $build_dir -& bash -l -c "ctest -R amd-aie --output-on-failure -j --repeat until-pass:5" +& bash -l -c "ctest -R amd-aie -E driver/xrt-lite --output-on-failure -j --repeat until-pass:5" popd if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index d04bf8bb4..857c81f44 100644 --- a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -31,7 +31,7 @@ mkdir -p "${cache_dir}/pip" python="$(which python)" echo "Using python: $python" -if [[ "$OSTYPE" == "linux-gnu"* ]]; then +if [[ "$OSTYPE" == "linux"* ]]; then export CMAKE_TOOLCHAIN_FILE="$this_dir/linux_default_toolchain.cmake" export CC=clang export CXX=clang++ @@ -61,54 +61,73 @@ echo '{ }' > $iree_dir/CMakeUserPresets.json cd $iree_dir -CMAKE_ARGS="\ - -GNinja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_dir \ - -DCMAKE_INSTALL_LIBDIR=lib \ - -DIREE_ERROR_ON_MISSING_SUBMODULES=OFF \ - -DIREE_ENABLE_ASSERTIONS=ON \ - -DIREE_BUILD_SAMPLES=OFF \ - -DIREE_BUILD_PYTHON_BINDINGS=ON \ - -DIREE_BUILD_BINDINGS_TFLITE=OFF \ - -DIREE_HAL_DRIVER_DEFAULTS=OFF \ - -DIREE_HAL_DRIVER_LOCAL_SYNC=ON \ - -DIREE_HAL_DRIVER_LOCAL_TASK=ON \ - -DIREE_TARGET_BACKEND_DEFAULTS=OFF \ - -DIREE_TARGET_BACKEND_LLVM_CPU=ON \ - -DIREE_INPUT_TOSA=OFF \ - -DIREE_INPUT_STABLEHLO=OFF \ - -DIREE_INPUT_TORCH=OFF \ - -DCMAKE_OBJECT_PATH_MAX=4096 \ - -DIREE_CMAKE_PLUGIN_PATHS=$repo_root" - -if [ -d "${llvm_install_dir}" ]; then - CMAKE_ARGS="$CMAKE_ARGS \ - -DIREE_BUILD_BUNDLED_LLVM=OFF \ - -DClang_DIR=$llvm_install_dir/lib/cmake/clang \ - -DLLD_DIR=$llvm_install_dir/lib/cmake/lld \ - -DMLIR_DIR=$llvm_install_dir/lib/cmake/mlir \ - -DLLVM_DIR=$llvm_install_dir/lib/cmake/llvm" +CMAKE_ARGS=( + -GNinja + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX="$install_dir" + -DCMAKE_INSTALL_LIBDIR=lib + -DIREE_ERROR_ON_MISSING_SUBMODULES=OFF + -DIREE_ENABLE_ASSERTIONS=ON + 
-DIREE_BUILD_SAMPLES=OFF + -DIREE_BUILD_PYTHON_BINDINGS=ON + -DIREE_BUILD_BINDINGS_TFLITE=OFF + -DIREE_HAL_DRIVER_DEFAULTS=OFF + -DIREE_HAL_DRIVER_LOCAL_SYNC=ON + -DIREE_HAL_DRIVER_LOCAL_TASK=ON + -DIREE_TARGET_BACKEND_DEFAULTS=OFF + -DIREE_TARGET_BACKEND_LLVM_CPU=ON + -DIREE_INPUT_TOSA=OFF + -DIREE_INPUT_STABLEHLO=OFF + -DIREE_INPUT_TORCH=OFF + -DCMAKE_OBJECT_PATH_MAX=4096 + -DIREE_CMAKE_PLUGIN_PATHS="$repo_root" + # iree/runtime/src/iree/hal/cts/cts_test_base.h:173:24: error: unused variable 'device_buffer' [-Werror,-Wunused-variable] + -DIREE_ENABLE_WERROR_FLAG=OFF +) + +PEANO_INSTALL_DIR=${PEANO_INSTALL_DIR:-""} +if [ "$PEANO_INSTALL_DIR" != "" ] && [ -d "$PEANO_INSTALL_DIR" ]; then + CMAKE_ARGS+=(-DPEANO_INSTALL_DIR="$PEANO_INSTALL_DIR") fi -if [[ "$OSTYPE" == "linux-gnu"* ]]; then - cmake $CMAKE_ARGS \ - -DCMAKE_EXE_LINKER_FLAGS_INIT="-fuse-ld=lld" \ - -DCMAKE_SHARED_LINKER_FLAGS_INIT="-fuse-ld=lld" \ - -DCMAKE_MODULE_LINKER_FLAGS_INIT="-fuse-ld=lld" \ - -DCMAKE_C_COMPILER="${CC}" \ - -DCMAKE_CXX_COMPILER="${CXX}" \ - -DLLVM_TARGET_ARCH=X86 \ - -DLLVM_TARGETS_TO_BUILD=X86 \ - -DIREE_EXTERNAL_HAL_DRIVERS=xrt \ - -S $iree_dir -B $build_dir +if [ -d "$llvm_install_dir" ]; then + CMAKE_ARGS+=( + -DIREE_BUILD_BUNDLED_LLVM=OFF + -DClang_DIR="$llvm_install_dir/lib/cmake/clang" + -DLLD_DIR="$llvm_install_dir/lib/cmake/lld" + -DMLIR_DIR="$llvm_install_dir/lib/cmake/mlir" + -DLLVM_DIR="$llvm_install_dir/lib/cmake/llvm" + ) +fi + +if [[ "$OSTYPE" == "linux"* ]]; then + CMAKE_ARGS+=( + -DCMAKE_EXE_LINKER_FLAGS_INIT="-fuse-ld=lld" + -DCMAKE_SHARED_LINKER_FLAGS_INIT="-fuse-ld=lld" + -DCMAKE_MODULE_LINKER_FLAGS_INIT="-fuse-ld=lld" + -DCMAKE_C_COMPILER="${CC}" + -DCMAKE_CXX_COMPILER="${CXX}" + -DLLVM_TARGET_ARCH=X86 + -DLLVM_TARGETS_TO_BUILD=X86 + -DIREE_EXTERNAL_HAL_DRIVERS="xrt;xrt-lite" + -S + "$iree_dir" + -B + "$build_dir" + ) elif [[ "$OSTYPE" == "darwin"* ]]; then - cmake $CMAKE_ARGS \ - -DLLVM_TARGET_ARCH="X86;ARM" \ - -DLLVM_TARGETS_TO_BUILD="X86;ARM" \ - -S 
$iree_dir -B $build_dir + CMAKE_ARGS+=( + -DLLVM_TARGET_ARCH="X86;ARM" + -DLLVM_TARGETS_TO_BUILD="X86;ARM" + -S + "$iree_dir" + -B + "$build_dir" + ) fi +cmake "${CMAKE_ARGS[@]}" + echo "Building all" echo "------------" cmake --build "$build_dir" -- -k 0 @@ -123,8 +142,8 @@ cmake --build "$build_dir" --target iree-install-dist echo "CTest" echo "-----" -if [[ "$OSTYPE" == "linux-gnu"* ]]; then - ctest --test-dir "$build_dir" -R amd-aie --output-on-failure -j +if [[ "$OSTYPE" == "linux"* ]]; then + ctest --test-dir "$build_dir" -R amd-aie -E "driver/xrt-lite" --output-on-failure -j elif [[ "$OSTYPE" == "darwin"* ]]; then ctest --test-dir "$build_dir" -R amd-aie -E "matmul_pack_peel_air_e2e|matmul_elementwise_pack_peel_air_e2e|conv_fill_spec_pad" --output-on-failure -j --repeat until-pass:5 fi diff --git a/build_tools/download_peano.ps1 b/build_tools/download_peano.ps1 index 89bd6808f..6589cc562 100644 --- a/build_tools/download_peano.ps1 +++ b/build_tools/download_peano.ps1 @@ -9,4 +9,7 @@ $ErrorActionPreference = 'Stop' $this_dir = Split-Path -Path $MyInvocation.MyCommand.Path -Parent $RELEASE = (Get-Content -Path "$this_dir/peano_commit.txt") pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly -Expand-Archive (Get-ChildItem -Filter llvm*.whl).FullName -DestinationPath $PWD.Path +$peano = (Get-ChildItem -Filter llvm*.whl) +$new_name = ($peano.Basename + ".zip") +Rename-Item -Path $peano.Name -NewName $new_name +Expand-Archive $new_name -DestinationPath $PWD.Path diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h index e7f52afc3..d4d18d6b3 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h @@ -132,11 +132,11 @@ struct amdxdna_cu_config { * @pad: MBZ * @cu_configs: Array of CU configurations of 
struct amdxdna_cu_config */ -struct amdxdna_hwctx_param_config_cu { - __u16 num_cus; - __u16 pad[3]; - struct amdxdna_cu_config cu_configs[] __counted_by(num_cus); -}; +// struct amdxdna_hwctx_param_config_cu { +// __u16 num_cus; +// __u16 pad[3]; +// struct amdxdna_cu_config cu_configs[] __counted_by(num_cus); +// }; enum amdxdna_drm_config_hwctx_param { DRM_AMDXDNA_HWCTX_CONFIG_CU, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index 3076a386a..739456ccf 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -9,6 +9,7 @@ #include #include +#include "pcidev.h" #include "shim_debug.h" namespace { @@ -16,7 +17,7 @@ namespace { uint32_t alloc_drm_bo(const shim_xdna::pdev& dev, amdxdna_bo_type type, void* buf, size_t size) { amdxdna_drm_create_bo cbo = { - .type = type, + .type = static_cast(type), .vaddr = reinterpret_cast(buf), .size = size, }; @@ -38,7 +39,7 @@ void get_drm_bo_info(const shim_xdna::pdev& dev, uint32_t boh, void* map_parent_range(size_t size) { auto p = ::mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!p) shim_err(errno, "mmap(len=%ld) failed", size); + if (!p) shim_xdna::shim_err(errno, "mmap(len=%ld) failed", size); return p; } @@ -101,7 +102,7 @@ bool is_power_of_two(size_t x) { return (x > 0) && ((x & (x - 1)) == 0); } void* addr_align(void* p, size_t align) { if (!is_power_of_two(align)) - shim_err(EINVAL, "Alignment 0x%lx is not power of two", align); + shim_xdna::shim_err(EINVAL, "Alignment 0x%lx is not power of two", align); return (void*)(((uintptr_t)p + align) & ~(align - 1)); } @@ -129,7 +130,8 @@ inline void clflush_data(const void* base, size_t offset, size_t len) { if (!cacheline_size) { long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - if (sz <= 0) shim_err(EINVAL, "Invalid cache line size: %ld", sz); + if (sz <= 0) + 
shim_xdna::shim_err(EINVAL, "Invalid cache line size: %ld", sz); cacheline_size = sz; } @@ -313,8 +315,7 @@ std::unique_ptr bo::share() const { amdxdna_bo_type bo::get_type() const { return m_type; } -bo::bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, - uint64_t flags) +bo::bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags) : bo(device, ctx_id, size, flags, flag_to_type(flags)) { if (m_type == AMDXDNA_BO_INVALID) shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); @@ -323,8 +324,8 @@ bo::bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, bo::bo(const device& device, size_t size, amdxdna_bo_type type) : bo(device, AMDXDNA_INVALID_CTX_HANDLE, size, 0, type) {} -bo::bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, - uint64_t flags, amdxdna_bo_type type) +bo::bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags, + amdxdna_bo_type type) : m_pdev(device.get_pdev()), m_aligned_size(size), m_flags(flags), diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h index 2f513ae8d..ea163db45 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -5,12 +5,11 @@ #define _BO_XDNA_H_ #include +#include +#include #include #include "amdxdna_accel.h" -#include "device.h" -#include "hwctx.h" -#include "pcidev.h" #include "shared.h" #include "shim_debug.h" @@ -62,6 +61,9 @@ struct xcl_bo_flags { }; }; +struct device; +struct pdev; + struct bo { // map_type - determines how a buffer is mapped enum class map_type { read, write }; @@ -87,10 +89,11 @@ struct bo { uint64_t kmhdl; // kernel mode handle }; - bo(const device& device, hw_ctx::slot_id ctx_id, size_t size, uint64_t flags, + using uint32_t = uint32_t; + bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags, amdxdna_bo_type type); - bo(const device& device, hw_ctx::slot_id 
ctx_id, size_t size, uint64_t flags); + bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags); bo(const device& device, shared_handle::export_handle ehdl); @@ -171,7 +174,7 @@ struct bo { // Used when exclusively assigned to a HW context. By default, BO is shared // among all HW contexts. - hw_ctx::slot_id m_owner_ctx_id = AMDXDNA_INVALID_CTX_HANDLE; + uint32_t m_owner_ctx_id = AMDXDNA_INVALID_CTX_HANDLE; void bind_at(size_t pos, const bo* bh, size_t offset, size_t size); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 20ec767ea..69676385c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -3,18 +3,15 @@ #include "device.h" -#include -#include - -#include #include #include "bo.h" #include "hwctx.h" +#include "pcidev.h" namespace shim_xdna { -device::device(const pdev& pdev, handle_type shim_handle) +device::device(const pdev& pdev, void* shim_handle) : m_pdev(pdev), m_handle(shim_handle) { shim_debug("Created KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str()); } @@ -24,16 +21,16 @@ device::~device() { m_pdev.close(); } -std::unique_ptr device::create_hw_context( - const device& dev, const hw_ctx::qos_type& qos) const { - return std::make_unique(dev, qos); -} +// std::unique_ptr device::create_hw_context( +// const device& dev, const hw_ctx::qos_type& qos) const { +// return std::make_unique(dev, qos); +// } -std::unique_ptr device::alloc_bo(void* userptr, hw_ctx::slot_id ctx_id, +std::unique_ptr device::alloc_bo(void* userptr, uint32_t ctx_id, size_t size, uint64_t flags) { if (userptr) shim_not_supported_err("User ptr BO"); - auto b = bo(this->m_pdev, ctx_id, size, flags); + auto b = bo(*this, ctx_id, size, flags); return std::make_unique(*this, ctx_id, size, flags); } diff --git 
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h index 8db0f2227..ca4f295f8 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ -5,13 +5,13 @@ #define PCIE_DEVICE_LINUX_XDNA_H #include +#include -#include "pcidev.h" +#include "shared.h" #include "shim_debug.h" namespace shim_xdna { -typedef void* xclDeviceHandle; #define XRT_NULL_HANDLE NULL // cuidx_type - encode cuidx and domain @@ -36,21 +36,22 @@ struct cuidx_type { using domain_index_type = uint16_t; }; -struct device { - // device index type - using id_type = unsigned int; - using slot_id = uint32_t; - using handle_type = xclDeviceHandle; +struct hw_ctx; +struct pdev; +struct bo; - device(const pdev& pdev, handle_type shim_handle); +struct device { + device(const pdev& pdev, void* shim_handle); ~device(); - std::unique_ptr alloc_bo(void* userptr, hw_ctx::slot_id ctx_id, - size_t size, uint64_t flags); + using qos_type = std::map; + enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; + std::unique_ptr alloc_bo(void* userptr, uint32_t ctx_id, size_t size, + uint64_t flags); - std::unique_ptr create_hw_context(const device& dev, - const hw_ctx::qos_type& qos) const; + // std::unique_ptr create_hw_context(const device& dev, + // const qos_type& qos) const; std::unique_ptr import_bo(shared_handle::export_handle ehdl) const; @@ -60,8 +61,8 @@ struct device { std::unique_ptr import_bo(pid_t, shared_handle::export_handle); - std::unique_ptr create_hw_context(const hw_ctx::qos_type& qos, - hw_ctx::access_mode mode) const; + std::unique_ptr create_hw_context(const qos_type& qos, + access_mode mode) const; std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, uint32_t size); @@ -76,7 +77,7 @@ struct device { const pdev& m_pdev; // The pcidev that this device object is derived from std::map m_bo_map; - 
xclDeviceHandle m_handle = XRT_NULL_HANDLE; + void* m_handle = XRT_NULL_HANDLE; mutable std::mutex m_mutex; }; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h index ac5858db4..bce5d1623 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h @@ -447,7 +447,7 @@ struct config_sk_image_uuid { uint32_t num_cus; uint32_t sk_name[5]; unsigned char sk_uuid[16]; - uint32_t slot_id; + uint32_t uint32_t; }; /** diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp index 4af239eb1..850e4198a 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp @@ -6,6 +6,9 @@ #include #include "amdxdna_accel.h" +#include "fence.h" +#include "hwctx.h" +#include "pcidev.h" namespace { @@ -138,15 +141,6 @@ fence::fence(const device& device, shared_handle::export_handle ehdl) shim_debug("Fence imported: %d@%ld", m_syncobj_hdl, m_state); } -fence::fence(const fence& f) - : m_pdev(f.m_pdev), - m_import(f.share()), - m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())), - m_state{f.m_state}, - m_signaled{f.m_signaled} { - shim_debug("Fence cloned: %d@%ld", m_syncobj_hdl, m_state); -} - fence::~fence() { shim_debug("Fence going away: %d@%ld", m_syncobj_hdl, m_state); destroy_syncobj(m_pdev, m_syncobj_hdl); @@ -161,10 +155,6 @@ std::unique_ptr fence::share() const { uint64_t fence::get_next_state() const { return m_state + 1; } -std::unique_ptr fence::clone() const { - return std::make_unique(*this); -} - uint64_t fence::wait_next_state() const { std::lock_guard guard(m_lock); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h index 
2b9067c1c..a5acd4c1d 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h @@ -6,13 +6,15 @@ #include -#include "device.h" -#include "hwctx.h" #include "shared.h" #include "shim_debug.h" namespace shim_xdna { +struct device; +struct hw_ctx; +struct pdev; + struct fence { using export_handle = shared_handle::export_handle; enum class access_mode : uint8_t { local, shared, process, hybrid }; @@ -23,8 +25,6 @@ struct fence { ~fence(); - std::unique_ptr clone() const; - std::unique_ptr share() const; void wait(uint32_t timeout_ms) const; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index a10abf3ae..23dd3b728 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -4,30 +4,16 @@ #include "hwctx.h" #include "bo.h" +#include "device.h" #include "hwq.h" +#include "pcidev.h" -namespace { - -std::vector get_pdi(const xrt_core::xclbin::aie_partition_obj& aie, - uint16_t kernel_id) { - for (auto& pdi : aie.pdis) { - for (auto& cdo : pdi.cdo_groups) { - for (auto kid : cdo.kernel_ids) { - if (kid == kernel_id) return pdi.pdi; - } - } - } - shim_err(ENOENT, "PDI for kernel ID 0x%x not found", kernel_id); -} - -} // namespace namespace shim_xdna { hw_ctx::hw_ctx(const device& dev, const qos_type& qos, std::unique_ptr q) : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) { shim_debug("Creating HW context..."); init_qos_info(qos); - parse_xclbin(xclbin); } hw_ctx::~hw_ctx() { @@ -35,9 +21,9 @@ hw_ctx::~hw_ctx() { shim_debug("Destroyed HW context (%d)...", m_handle); } -hw_ctx::slot_id hw_ctx::get_slotidx() const { return m_handle; } +uint32_t hw_ctx::get_slotidx() const { return m_handle; } -void hw_ctx::set_slotidx(slot_id id) { m_handle = id; } +void 
hw_ctx::set_slotidx(uint32_t id) { m_handle = id; } cuidx_type hw_ctx::open_cu_context(const std::string& cu_name) { for (uint32_t i = 0; i < m_cu_info.size(); i++) { @@ -82,20 +68,6 @@ void hw_ctx::init_qos_info(const qos_type& qos) { } } -void hw_ctx::print_xclbin_info() { - if (m_cu_info.empty()) { - shim_debug("CU INFO is empty"); - return; - } - - for (int idx = 0; idx < m_cu_info.size(); idx++) { - auto& e = m_cu_info[idx]; - shim_debug("index=%d, name=%s, func=%d, pdi(p=%p, sz=%ld)", idx, - e.m_name.c_str(), e.m_func, e.m_pdi.data(), e.m_pdi.size()); - } - shim_debug("OPs/cycle: %d", m_ops_per_cycle); -} - const device& hw_ctx::get_device() { return m_device; } const std::vector& hw_ctx::get_cu_info() const { @@ -107,10 +79,10 @@ void hw_ctx::create_ctx_on_device() { arg.qos_p = reinterpret_cast(&m_qos); arg.umq_bo = m_q->get_queue_bo(); arg.max_opc = m_ops_per_cycle; - arg.num_tiles = - m_num_cols * - xrt_core::device_query(&m_device) - .core_rows; + // arg.num_tiles = + // m_num_cols * + // xrt_core::device_query(&m_device) + // .core_rows; arg.log_buf_bo = m_log_bo ? 
static_cast(m_log_bo.get())->get_drm_bo_handle() : AMDXDNA_INVALID_BO_HANDLE; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h index 446d64f58..9fec8cc6f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -7,18 +7,19 @@ #include #include "amdxdna_accel.h" -#include "device.h" +#include "shared.h" #include "shim_debug.h" namespace shim_xdna { struct hw_q; // forward declaration +struct device; +struct bo; +struct cuidx_type; struct hw_ctx { using qos_type = std::map; enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; - using access_mode = access_mode; - using slot_id = uint32_t; hw_ctx(const device& dev, const qos_type& qos, std::unique_ptr q); @@ -29,7 +30,7 @@ struct hw_ctx { void update_access_mode(access_mode) { shim_not_supported_err(__func__); } - slot_id get_slotidx() const; + uint32_t get_slotidx() const; hw_q* get_hw_queue(); @@ -57,7 +58,7 @@ struct hw_ctx { const std::vector& get_cu_info() const; - void set_slotidx(slot_id id); + void set_slotidx(uint32_t id); void set_doorbell(uint32_t db); @@ -68,7 +69,7 @@ struct hw_ctx { void fini_log_buf(); const device& m_device; - slot_id m_handle = AMDXDNA_INVALID_CTX_HANDLE; + uint32_t m_handle = AMDXDNA_INVALID_CTX_HANDLE; amdxdna_qos_info m_qos = {}; std::vector m_cu_info; std::unique_ptr m_q; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp index d52c2a8ac..25bc89ad0 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -6,6 +6,8 @@ #include "bo.h" #include "ert.h" #include "fence.h" +#include "hwctx.h" +#include "pcidev.h" #include "shim_debug.h" namespace { @@ -22,7 +24,7 @@ int wait_cmd(const shim_xdna::pdev &pdev, 
const shim_xdna::hw_ctx *ctx, auto boh = static_cast(cmd); auto id = boh->get_cmd_id(); - shim_debug("Waiting for cmd (%ld)...", id); + shim_xdna::shim_debug("Waiting for cmd (%ld)...", id); amdxdna_drm_wait_cmd wcmd = { .hwctx = ctx->get_slotidx(), diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h index 30b3cfbf9..98442c49c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h @@ -3,12 +3,15 @@ #ifndef _HWQ_XDNA_H_ #define _HWQ_XDNA_H_ - -#include "fence.h" -#include "hwctx.h" -#include "shim_debug.h" +#include +#include namespace shim_xdna { +struct device; +struct bo; +struct hw_ctx; +struct pdev; +struct fence; struct hw_q { hw_q(const device &device); @@ -25,7 +28,7 @@ struct hw_q { void submit_signal(const fence *); - virtual void bind_hwctx(const hw_ctx *ctx) = 0; + void bind_hwctx(const hw_ctx *ctx); void unbind_hwctx(); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp index e057c61ac..f078d3e00 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp @@ -427,9 +427,8 @@ void* pdev::mmap(void* addr, size_t len, int prot, int flags, void pdev::munmap(void* addr, size_t len) const { ::munmap(addr, len); } -std::shared_ptr pdev::create_device(device::handle_type handle, - device::id_type id) const { - auto dev = std::make_shared(*this, handle, id); +std::shared_ptr pdev::create_device(void* handle) const { + auto dev = std::make_shared(*this, handle); // Alloc device memory on first device creation. // No locking is needed since driver will ensure only one heap BO is // created. 
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h index a2cb858fe..a84fa646c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h @@ -39,8 +39,7 @@ struct pdev { std::string get_subdev_path(const std::string& subdev, uint32_t idx) const; - std::shared_ptr create_device(device::handle_type handle, - device::id_type id) const; + std::shared_ptr create_device(void* handle) const; void ioctl(unsigned long cmd, void* arg) const; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp index f57848458..d761a3995 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp @@ -8,7 +8,7 @@ static std::recursive_mutex s_debug_mutex; -namespace { +namespace shim_xdna { struct debug_lock { std::lock_guard m_lk; debug_lock(); @@ -32,4 +32,4 @@ void debugf(const char* format, ...) { vprintf(format, args); va_end(args); } -} // namespace \ No newline at end of file +} // namespace shim_xdna \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h index fbc88dc09..e37dc2b55 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h @@ -9,11 +9,10 @@ #include #include #include -#include #include "llvm/Support/Error.h" -namespace { +namespace shim_xdna { void debugf(const char* format, ...); @@ -52,6 +51,6 @@ void shim_info(const char* fmt, Args&&... 
args) { XRT_PRINTF(format.c_str(), getpid(), std::forward(args)...); } -} // namespace +} // namespace shim_xdna #endif // SHIM_DEBUG_H From 488b3d22f5734bd205e5640ee6309044028cd19d Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 9 Oct 2024 15:40:32 -0400 Subject: [PATCH 04/35] start from scratch --- .../driver/xrt-lite/CMakeLists.txt | 13 +- .../src/iree-amd-aie/driver/xrt-lite/api.h | 30 +- .../driver/xrt-lite/cts/CMakeLists.txt | 130 +- .../iree-amd-aie/driver/xrt-lite/device.cc | 34 + .../src/iree-amd-aie/driver/xrt-lite/device.h | 17 + .../iree-amd-aie/driver/xrt-lite/driver.cc | 189 ++- .../driver/xrt-lite/native_executable.cc | 297 ----- .../driver/xrt-lite/native_executable.h | 44 - .../xrt-lite/registration/CMakeLists.txt | 7 +- .../xrt-lite/registration/driver_module.c | 40 +- .../xrt-lite/registration/driver_module.h | 6 +- .../driver/xrt-lite/shim/CMakeLists.txt | 10 - .../driver/xrt-lite/shim/linux/CMakeLists.txt | 8 - .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 31 - .../xrt-lite/shim/linux/kmq/amdxdna_accel.h | 591 --------- .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 444 ------- .../driver/xrt-lite/shim/linux/kmq/bo.h | 194 --- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 116 -- .../driver/xrt-lite/shim/linux/kmq/device.h | 87 -- .../driver/xrt-lite/shim/linux/kmq/ert.h | 1176 ----------------- .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 221 ---- .../driver/xrt-lite/shim/linux/kmq/fence.h | 62 - .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 123 -- .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 89 -- .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 90 -- .../driver/xrt-lite/shim/linux/kmq/hwq.h | 46 - .../driver/xrt-lite/shim/linux/kmq/pcidev.cpp | 443 ------- .../driver/xrt-lite/shim/linux/kmq/pcidev.h | 85 -- .../driver/xrt-lite/shim/linux/kmq/pcidrv.cpp | 49 - .../driver/xrt-lite/shim/linux/kmq/pcidrv.h | 24 - .../driver/xrt-lite/shim/linux/kmq/shared.h | 24 - .../xrt-lite/shim/linux/kmq/shim_debug.cpp | 35 - 
.../xrt-lite/shim/linux/kmq/shim_debug.h | 56 - 33 files changed, 254 insertions(+), 4557 deletions(-) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/device.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/device.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp delete mode 100644 
runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt index 248412689..3c605d231 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -16,20 +16,25 @@ iree_register_external_hal_driver( iree_hal_xrt_lite_driver_module_register ) +find_package(ShimXDNA REQUIRED) + iree_cc_library( NAME xrt-lite SRCS api.h - native_executable.h - native_executable.cc + device.cc driver.cc util.h DEPS iree::base iree::base::core_headers iree::base::internal::flatcc::parsing - iree-amd-aie::schemas::pdi_executable_def_c_fbs - iree-amd-aie::driver::xrt-lite::shim::linux::kmq::kmq + iree-amd-aie::schemas::xrt_executable_def_c_fbs + xrt_driver_xdna + $ + COPTS + $<$:-fexceptions -frtti> + $<$:/EHsc /GR> PUBLIC ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h index 8e3b00649..af257ac50 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -14,9 +14,35 @@ extern "C" { #endif // __cplusplus +// Must be initialized with iree_hal_xrt_lite_device_options_initialize prior to +// use. +struct iree_hal_xrt_lite_device_options_t { + // TODO(null): options for initializing a device such as hardware identifiers, + // implementation mode switches, and debugging control. +}; + +// Initializes |out_params| to default values. 
+IREE_API_EXPORT void iree_hal_xrt_lite_device_options_initialize( + struct iree_hal_xrt_lite_device_options_t* out_params); + +struct iree_hal_xrt_lite_driver_options_t { + // TODO(null): options for initializing the driver such as library search + // paths, version min/max, etc. + struct iree_hal_xrt_lite_device_options_t default_device_options; +}; + +IREE_API_EXPORT void iree_hal_xrt_lite_driver_options_initialize( + struct iree_hal_xrt_lite_driver_options_t* out_options); + +// The provided |identifier| will be used by programs to distinguish the device +// type from other HAL implementations. If compiling programs with the IREE +// compiler this must match the value used by IREE::HAL::TargetDevice. +// +// |out_driver| must be released by the caller (see iree_hal_driver_release). IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( - iree_string_view_t identifier, iree_allocator_t host_allocator, - iree_hal_driver_t** out_driver); + iree_string_view_t identifier, + const struct iree_hal_xrt_lite_driver_options_t* options, + iree_allocator_t host_allocator, iree_hal_driver_t** out_driver); #ifdef __cplusplus } // extern "C" diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt index e210f00a1..eacafbf82 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -6,54 +6,54 @@ include(CMakeDependentOption) -set(PEANO_INSTALL_DIR "" CACHE PATH "") -set(VITIS_DIR "" CACHE PATH "") -if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) - message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") -endif() -cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") -set(TARGET_DEVICE "npu1_4col" CACHE STRING "") - -iree_bytecode_module( - NAME - xrt_lite_executable_cache_test_module - MODULE_FILE_NAME - xrt_lite_executable_cache_test.bin - SRC - 
"${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" - FLAGS - --compile-mode=hal-executable - --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} - --iree-hal-target-backends=amd-aie - --iree-amdaie-lower-to-aie-pipeline=air - --iree-amdaie-target-device=${TARGET_DEVICE} - --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} - --iree-amd-aie-vitis-install-dir=${VITIS_DIR} - --iree-amd-aie-enable-chess=$ - --iree-amd-aie-show-invoked-commands - PUBLIC - TESTONLY -) - -iree_c_embed_data( - NAME - xrt_lite_executables_c - SRCS - xrt_lite_executable_cache_test.bin - C_FILE_OUTPUT - xrt_lite_executables_c.c - H_FILE_OUTPUT - xrt_lite_executables_c.h - IDENTIFIER - iree_cts_testdata_executables_aie_xrt_lite - STRIP_PREFIX - xrt_lite_ - DEPENDS - ::xrt_lite_executable_cache_test_module - FLATTEN - PUBLIC - TESTONLY -) +#set(PEANO_INSTALL_DIR "" CACHE PATH "") +#set(VITIS_DIR "" CACHE PATH "") +#if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) +# message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") +#endif() +#cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") +#set(TARGET_DEVICE "npu1_4col" CACHE STRING "") +# +#iree_bytecode_module( +# NAME +# xrt_lite_executable_cache_test_module +# MODULE_FILE_NAME +# xrt_lite_executable_cache_test.bin +# SRC +# "${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" +# FLAGS +# --compile-mode=hal-executable +# --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} +# --iree-hal-target-backends=amd-aie +# --iree-amdaie-lower-to-aie-pipeline=air +# --iree-amdaie-target-device=${TARGET_DEVICE} +# --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} +# --iree-amd-aie-vitis-install-dir=${VITIS_DIR} +# --iree-amd-aie-enable-chess=$ +# --iree-amd-aie-show-invoked-commands +# PUBLIC +# TESTONLY +#) +# +#iree_c_embed_data( +# NAME +# xrt_lite_executables_c +# SRCS +# xrt_lite_executable_cache_test.bin +# C_FILE_OUTPUT +# xrt_lite_executables_c.c +# H_FILE_OUTPUT +# xrt_lite_executables_c.h +# 
IDENTIFIER +# iree_cts_testdata_executables_aie_xrt_lite +# STRIP_PREFIX +# xrt_lite_ +# DEPENDS +# ::xrt_lite_executable_cache_test_module +# FLATTEN +# PUBLIC +# TESTONLY +#) iree_hal_cts_test_suite( DRIVER_NAME @@ -65,7 +65,7 @@ iree_hal_cts_test_suite( COMPILER_TARGET_BACKEND "amd-aie" EXECUTABLE_FORMAT - "\"amdaie-pdi-fb\"" + "\"amdaie-xclbin-fb\"" DEPS iree-amd-aie::driver::xrt-lite::registration INCLUDED_TESTS @@ -75,19 +75,19 @@ iree_hal_cts_test_suite( "driver" ) -iree_cc_test( - NAME - xrt_lite_command_buffer_dispatch_test - SRCS - xrt_lite_command_buffer_dispatch_test.cc - DEPS - ::xrt_lite_executables_c - iree-amd-aie::driver::xrt-lite::registration - iree::base - iree::hal - iree::hal::cts::cts_test_base - iree::testing::gtest_main - iree::tools::testing::e2e::e2e_test_util -) - -target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_command_buffer_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") \ No newline at end of file +#iree_cc_test( +# NAME +# xrt_lite_command_buffer_dispatch_test +# SRCS +# xrt_lite_command_buffer_dispatch_test.cc +# DEPS +# ::xrt_lite_executables_c +# iree-amd-aie::driver::xrt-lite::registration +# iree::base +# iree::hal +# iree::hal::cts::cts_test_base +# iree::testing::gtest_main +# iree::tools::testing::e2e::e2e_test_util +#) +# +#target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_command_buffer_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc new file mode 100644 index 000000000..4768c0cf7 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -0,0 +1,34 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/device.h" + +#include "iree-amd-aie/driver/xrt-lite/api.h" + +struct iree_hal_xrt_lite_device_t { + iree_hal_resource_t resource; + iree_string_view_t identifier; + + iree_allocator_t host_allocator; + iree_hal_allocator_t* device_allocator; +}; + +namespace { +extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; +} + +void iree_hal_xrt_lite_device_options_initialize( + iree_hal_xrt_lite_device_options_t* out_options) { + memset(out_options, 0, sizeof(*out_options)); + // TODO(null): set defaults based on compiler configuration. Flags should not + // be used as multiple devices may be configured within the process or the + // hosting application may be authored in python/etc that does not use a flags + // mechanism accessible here. +} + +namespace { +const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = {}; +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h new file mode 100644 index 000000000..c8d2a6e1f --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h @@ -0,0 +1,17 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_DEVICE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_DEVICE_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +// NOTE: nothing in the skeleton implementation. Device creation and adoption is +// part of the public API header. This header can contain internal types and +// functions. 
+ +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_DEVICE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 1e0d06c12..1f1c489f2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -1,147 +1,112 @@ -// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "iree/base/api.h" -#include "iree/base/tracing.h" -#include "iree/hal/api.h" +#include "iree-amd-aie/driver/xrt-lite/api.h" #include "util.h" -// Maximum device path length we support. The path is always a 16 character hex -// string. -#define IREE_HAL_XRT_LITE_MAX_DEVICE_PATH_LENGTH 32 -// Maximum device name length we support. -#define IREE_HAL_XRT_LITE_MAX_DEVICE_NAME_LENGTH 64 - -struct iree_hal_xrt_lite_driver_t { - // Abstract resource used for injecting reference counting and vtable; must be - // at offset 0. 
+typedef struct iree_hal_xrt_lite_driver_t { iree_hal_resource_t resource; iree_allocator_t host_allocator; iree_string_view_t identifier; - uint64_t device_hdl; -}; + iree_hal_xrt_lite_driver_options_t options; + // + trailing identifier string storage +} iree_hal_xrt_lite_driver_t; -static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { - iree_hal_xrt_lite_driver_t* driver = - reinterpret_cast(base_driver); - iree_allocator_t host_allocator = driver->host_allocator; - IREE_TRACE_ZONE_BEGIN(z0); +namespace { +extern const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable; +} - iree_allocator_free(host_allocator, driver); +static iree_hal_xrt_lite_driver_t* iree_hal_xrt_lite_driver_cast( + iree_hal_driver_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_driver_vtable); + return (iree_hal_xrt_lite_driver_t*)base_value; +} - IREE_TRACE_ZONE_END(z0); +void iree_hal_xrt_lite_driver_options_initialize( + iree_hal_xrt_lite_driver_options_t* out_options) { + memset(out_options, 0, sizeof(*out_options)); + + // TODO(null): set defaults based on compiler configuration. Flags should not + // be used as multiple devices may be configured within the process or the + // hosting application may be authored in python/etc that does not use a flags + // mechanism accessible here. + iree_hal_xrt_lite_device_options_initialize( + &out_options->default_device_options); } -static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( - iree_hal_driver_t* base_driver, iree_allocator_t host_allocator, - iree_host_size_t* out_device_info_count, - iree_hal_device_info_t** out_device_infos) { - iree_hal_xrt_lite_driver_t* driver = - reinterpret_cast(base_driver); - uint64_t device_hdl = driver->device_hdl; - // Allocate the return infos and populate with the devices. 
- iree_hal_device_info_t* device_infos = nullptr; - iree_host_size_t single_info_size = - sizeof(iree_hal_device_info_t) + - (IREE_HAL_XRT_LITE_MAX_DEVICE_PATH_LENGTH + - IREE_HAL_XRT_LITE_MAX_DEVICE_NAME_LENGTH) * - sizeof(char); - IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, single_info_size, - (void**)&device_infos)); - - uint8_t* buffer_ptr = (uint8_t*)device_infos + sizeof(iree_hal_device_info_t); - memset(device_infos, 0, sizeof(*device_infos)); - - // device_infos->device_id = 0; - // std::string device_name = "aie2"; - // const size_t name_len = strlen(device_name.c_str()); - // if (name_len >= IREE_HAL_XRT_LITE_MAX_DEVICE_NAME_LENGTH) { - // return iree_make_status(IREE_STATUS_OUT_OF_RANGE, - // "device name out of range"); - // } - // buffer_ptr += iree_string_view_append_to_buffer( - // iree_make_string_view(device_name.c_str(), name_len), - // &device_infos->name, (char*)buffer_ptr); - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED); - - *out_device_info_count = 1; - *out_device_infos = device_infos; - return status; +static iree_status_t iree_hal_xrt_lite_driver_options_verify( + const iree_hal_xrt_lite_driver_options_t* options) { + // TODO(null): verify that the parameters are within expected ranges and any + // requested features are supported. 
+ return iree_ok_status(); } -static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( - iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id, - iree_host_size_t param_count, const iree_string_pair_t* params, - iree_allocator_t host_allocator, iree_hal_device_t** out_device) { +IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( + iree_string_view_t identifier, + const iree_hal_xrt_lite_driver_options_t* options, + iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) { + IREE_ASSERT_ARGUMENT(options); + IREE_ASSERT_ARGUMENT(out_driver); + *out_driver = nullptr; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_driver_t* driver = - reinterpret_cast(base_driver); - iree_string_view_t device_name = iree_make_cstring_view("xrt-lite"); - // iree_status_t status = iree_hal_xrt_lite_device_create( - // device_name, &driver->device_params, host_allocator, out_device); - - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED); + // TODO(null): verify options; this may be moved after any libraries are + // loaded so the verification can use underlying implementation queries. + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_driver_options_verify(options)); + iree_hal_xrt_lite_driver_t* driver = nullptr; + iree_host_size_t total_size = sizeof(*driver) + identifier.size; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, total_size, (void**)&driver)); + iree_hal_resource_initialize(&iree_hal_xrt_lite_driver_vtable, + &driver->resource); + driver->host_allocator = host_allocator; + iree_string_view_append_to_buffer( + identifier, &driver->identifier, + (char*)driver + total_size - identifier.size); + + // TODO(null): if there are any string fields then they will need to be + // retained as well (similar to the identifier they can be tagged on to the + // end of the driver struct). 
+ memcpy(&driver->options, options, sizeof(*options)); + + // TODO(null): load libraries and query driver support from the system. + // Devices need not be enumerated here if doing so is expensive; the + // application may create drivers just to see if they are present but defer + // device enumeration until the user requests one. Underlying implementations + // can sometimes do bonkers static init stuff as soon as they are touched and + // this code may want to do that on-demand instead. + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + *out_driver = (iree_hal_driver_t*)driver; + } else { + iree_hal_driver_release((iree_hal_driver_t*)driver); + } IREE_TRACE_ZONE_END(z0); return status; } -static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path( - iree_hal_driver_t* base_driver, iree_string_view_t driver_name, - iree_string_view_t device_path, iree_host_size_t param_count, - const iree_string_pair_t* params, iree_allocator_t host_allocator, - iree_hal_device_t** out_device) { - IREE_TRACE_ZONE_BEGIN(z0); +static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { iree_hal_xrt_lite_driver_t* driver = - reinterpret_cast(base_driver); - iree_string_view_t device_name = iree_make_cstring_view("xrt"); + iree_hal_xrt_lite_driver_cast(base_driver); + iree_allocator_t host_allocator = driver->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); - // iree_status_t status = iree_hal_xrt_lite_device_create( - // device_name, &driver->device_params, host_allocator, out_device); + // TODO(null): if the driver loaded any libraries they should be closed here. 
- iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED); + iree_allocator_free(host_allocator, driver); IREE_TRACE_ZONE_END(z0); - return status; } namespace { const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable = { - /*.destroy = */ iree_hal_xrt_lite_driver_destroy, - /*.query_available_devices = */ - iree_hal_xrt_lite_driver_query_available_devices, - /*.dump_device_info = */ unimplemented, - /*.create_device_by_id = */ iree_hal_xrt_lite_driver_create_device_by_id, - /*.create_device_by_path = */ - iree_hal_xrt_lite_driver_create_device_by_path, + .destroy = iree_hal_xrt_lite_driver_destroy, + .query_available_devices = unimplemented, }; -} // namespace - -IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( - iree_string_view_t identifier, iree_allocator_t host_allocator, - iree_hal_driver_t** out_driver) { - IREE_ASSERT_ARGUMENT(out_driver); - IREE_TRACE_ZONE_BEGIN(z0); - - iree_hal_xrt_lite_driver_t* driver = nullptr; - iree_host_size_t total_size = iree_sizeof_struct(*driver) + identifier.size; - IREE_RETURN_IF_ERROR( - iree_allocator_malloc(host_allocator, total_size, (void**)&driver)); - - iree_hal_resource_initialize(&iree_hal_xrt_lite_driver_vtable, - &driver->resource); - - driver->host_allocator = host_allocator; - iree_string_view_append_to_buffer( - identifier, &driver->identifier, - (char*)driver + iree_sizeof_struct(*driver)); - - *out_driver = reinterpret_cast(driver); - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc deleted file mode 100644 index 12b1e614c..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.cc +++ /dev/null @@ -1,297 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree-amd-aie/driver/xrt-lite/native_executable.h" - -#include -#include - -#include "iree-amd-aie/schemas/pdi_executable_def_reader.h" -#include "iree-amd-aie/schemas/pdi_executable_def_verifier.h" -#include "iree/base/api.h" - -struct iree_hal_xrt_lite_native_executable_t { - iree_hal_resource_t resource; - iree_allocator_t host_allocator; - iree_host_size_t entry_point_count; - iree_hal_xrt_lite_kernel_info_t entry_points[]; -}; - -namespace { -extern const iree_hal_executable_vtable_t - iree_hal_xrt_lite_native_executable_vtable; -} - -static iree_hal_xrt_lite_native_executable_t* -iree_hal_xrt_lite_native_executable_cast(iree_hal_executable_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_native_executable_vtable); - return reinterpret_cast(base_value); -} - -static iree_status_t iree_hal_xrt_lite_native_executable_flatbuffer_verify( - iree_const_byte_span_t flatbuffer_data) { - if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) { - return iree_make_status( - IREE_STATUS_INVALID_ARGUMENT, - "flatbuffer data is not present or less than 16 bytes (%zu total)", - flatbuffer_data.data_length); - } - - int verify_ret = iree_amd_aie_hal_xrt_lite_ExecutableDef_verify_as_root( - flatbuffer_data.data, flatbuffer_data.data_length); - if (verify_ret != flatcc_verify_ok) { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "flatbuffer verification failed: %s", - flatcc_verify_error_string(verify_ret)); - } - - iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = - iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root(flatbuffer_data.data); - - flatbuffers_string_vec_t entry_points_vec = - iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); - size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); - if (entry_point_count == 0) { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "no entry points found in the 
executable"); - } - - for (size_t i = 0; i < entry_point_count; ++i) { - if (!flatbuffers_string_len( - flatbuffers_string_vec_at(entry_points_vec, i))) { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "executable entry point %zu has no name", i); - } - } - - iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis = - iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); - size_t number_pdi = iree_amd_aie_hal_xrt_lite_PdiDef_vec_len(pdis); - if (number_pdi == 0) { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no pdi present"); - } - - iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instr = - iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); - size_t number_asm_instr = - iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_len(asm_instr); - if (number_asm_instr != entry_point_count) { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "number of entry points (%zu) and number of asm " - "instructions (%zu) mismatched", - entry_point_count, number_asm_instr); - } - - return iree_ok_status(); -} - -// iree_status_t iree_hal_xrt_lite_native_executable_create( -// const iree_hal_executable_params_t* executable_params, -// iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator, -// iree_hal_executable_t** out_executable) { -// IREE_ASSERT_ARGUMENT(device_allocator); -// IREE_ASSERT_ARGUMENT(executable_params); -// IREE_ASSERT_ARGUMENT(out_executable); -// -// IREE_TRACE_ZONE_BEGIN(z0); -// -// *out_executable = nullptr; -// iree_hal_xrt_lite_native_executable_t* executable = nullptr; -// -// IREE_RETURN_AND_END_ZONE_IF_ERROR( -// z0, iree_hal_xrt_lite_native_executable_flatbuffer_verify( -// executable_params->executable_data)); -// -// iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = -// iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root( -// executable_params->executable_data.data); -// -// flatbuffers_uint32_vec_t pdi_indices_vec = -// 
iree_amd_aie_hal_xrt_lite_ExecutableDef_pdi_indices_get(executable_def); -// -// flatbuffers_uint32_vec_t asm_instr_indices_vec = -// iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instr_indices_get( -// executable_def); -// -// flatbuffers_string_vec_t entry_points_vec = -// iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); -// -// iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis_vec = -// iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); -// -// iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instrs_vec = -// iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); -// -// iree_host_size_t entry_point_count = -// flatbuffers_string_vec_len(entry_points_vec); -// -// iree_host_size_t total_entry_point_name_chars = 0; -// IREE_TRACE({ -// for (iree_host_size_t entry_ordinal = 0; entry_ordinal < -// entry_point_count; -// entry_ordinal++) { -// const char* entry_name = -// flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); -// total_entry_point_name_chars += flatbuffers_string_len(entry_name); -// } -// }); -// -// iree_host_size_t total_size = -// sizeof(*executable) + -// entry_point_count * sizeof(executable->entry_points[0]) + -// total_entry_point_name_chars; -// IREE_RETURN_AND_END_ZONE_IF_ERROR( -// z0, iree_allocator_malloc(host_allocator, total_size, -// reinterpret_cast(&executable))); -// IREE_TRACE( -// char* string_table_buffer = -// (char*)((char*)executable + sizeof(*executable) + -// entry_point_count * sizeof(executable->entry_points[0]))); -// -// iree_hal_resource_initialize(&iree_hal_xrt_lite_native_executable_vtable, -// &executable->resource); -// -// executable->host_allocator = host_allocator; -// executable->entry_point_count = entry_point_count; -// -// for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; -// entry_ordinal++) { -// const char* entry_name = -// flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); -// uint32_t pdi_index = -// 
flatbuffers_uint32_vec_at(pdi_indices_vec, entry_ordinal); -// iree_amd_aie_hal_xrt_lite_PdiDef_table_t pdi_def = -// iree_amd_aie_hal_xrt_lite_PdiDef_vec_at(pdis_vec, pdi_index); -// flatbuffers_string_t pdi_fb = -// iree_amd_aie_hal_xrt_lite_PdiDef_pdi_get(pdi_def); -// uint32_t num_pdi_chars = flatbuffers_string_len(pdi_fb); -// uint32_t asm_instr_index = -// flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); -// iree_amd_aie_hal_xrt_lite_AsmInstDef_table_t asminst_def = -// iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_at(asm_instrs_vec, -// asm_instr_index); -// flatbuffers_uint32_vec_t asm_inst = -// iree_amd_aie_hal_xrt_lite_AsmInstDef_asm_inst_get(asminst_def); -// uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); -// -// iree_hal_xrt_lite_allocator_t* allocator = -// iree_hal_xrt_lite_allocator_cast(device_allocator); -// iree_hal_xrt_lite_kernel_info_t* params = -// &executable->entry_points[entry_ordinal]; -// params->num_instr = num_instr; -// // Load the IPU and PDI files into a global pool that doesn't support -// kernel -// // args (DEV BO). 
-// IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( -// z0, symbols, -// xrt_lite_amd_memory_pool_allocate( -// allocator->global_dev_mem_pool, num_instr * sizeof(uint32_t), 0, -// reinterpret_cast(¶ms->ipu_inst_buf)), -// "xrt_lite_amd_memory_pool_allocate"); -// std::memcpy(params->ipu_inst_buf, asm_inst, num_instr * -// sizeof(uint32_t)); IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( -// z0, symbols, -// xrt_lite_amd_get_handle_from_vaddr(params->ipu_inst_buf, -// ¶ms->ipu_inst_handle), -// "xrt_lite_amd_agent_iterate_memory_pools"); -// IREE_ASSERT(params->ipu_inst_handle); -// -// IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( -// z0, symbols, -// xrt_lite_amd_memory_pool_allocate( -// allocator->global_dev_mem_pool, num_pdi_chars, 0, -// reinterpret_cast(¶ms->pdi_buf)), -// "xrt_lite_amd_memory_pool_allocate"); -// std::memcpy(params->pdi_buf, pdi_fb, num_pdi_chars * sizeof(char)); -// IREE_XRT_LITE_RETURN_AND_END_ZONE_IF_ERROR( -// z0, symbols, -// xrt_lite_amd_get_handle_from_vaddr(params->pdi_buf, -// ¶ms->pdi_handle), -// "xrt_lite_amd_agent_iterate_memory_pools"); -// IREE_ASSERT(params->pdi_handle); -// -// (void)entry_name; -// IREE_TRACE({ -// iree_host_size_t entry_name_length = -// flatbuffers_string_len(entry_name); memcpy(string_table_buffer, -// entry_name, entry_name_length); string_table_buffer += -// entry_name_length; -// }); -// -// IREE_TRACE({ -// if -// (iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_is_present( -// executable_def)) { -// iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_t source_locs_vec = -// iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_get( -// executable_def); -// iree_amd_aie_hal_xrt_lite_FileLineLocDef_table_t source_loc = -// iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_at(source_locs_vec, -// entry_ordinal); -// flatbuffers_string_t filename = -// iree_amd_aie_hal_xrt_lite_FileLineLocDef_filename_get(source_loc); -// uint32_t line = -// iree_amd_aie_hal_xrt_lite_FileLineLocDef_line_get(source_loc); -// 
params->source_filename = -// iree_make_string_view(filename, -// flatbuffers_string_len(filename)); -// params->source_line = line; -// } -// }); -// } -// -// iree_status_t status = iree_ok_status(); -// -// if (iree_status_is_ok(status)) { -// *out_executable = reinterpret_cast(executable); -// } else { -// iree_hal_executable_destroy( -// reinterpret_cast(executable)); -// } -// -// IREE_TRACE_ZONE_END(z0); -// return status; -// } - -static void iree_hal_xrt_lite_native_executable_destroy( - iree_hal_executable_t* base_executable) { - iree_hal_xrt_lite_native_executable_t* executable = - iree_hal_xrt_lite_native_executable_cast(base_executable); - iree_allocator_t host_allocator = executable->host_allocator; - IREE_TRACE_ZONE_BEGIN(z0); - - for (iree_host_size_t entry_ordinal = 0; - entry_ordinal < executable->entry_point_count; entry_ordinal++) { - iree_hal_xrt_lite_kernel_info_t* params = - &executable->entry_points[entry_ordinal]; - } - - iree_allocator_free(host_allocator, executable); - - IREE_TRACE_ZONE_END(z0); -} - -iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_info( - iree_hal_executable_t* base_executable, int32_t entry_point, - iree_hal_xrt_lite_kernel_info_t* out_info) { - iree_hal_xrt_lite_native_executable_t* executable = - iree_hal_xrt_lite_native_executable_cast(base_executable); - if (entry_point >= executable->entry_point_count) { - return iree_make_status(IREE_STATUS_OUT_OF_RANGE, - "entry point ordinal %d out of range; executable " - "only contains %ld entry points", - entry_point, executable->entry_point_count); - } - memcpy(out_info, &executable->entry_points[entry_point], sizeof(*out_info)); - return iree_ok_status(); -} - -namespace { -const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable = - { - /*destroy=*/iree_hal_xrt_lite_native_executable_destroy, -}; -} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h 
b/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h deleted file mode 100644 index c2bc45b1c..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/native_executable.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ -#define IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ - -#include - -#include "iree/base/api.h" -#include "iree/base/tracing.h" -#include "iree/hal/api.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -struct iree_hal_xrt_lite_kernel_info_t { - uint32_t ipu_inst_handle; - uint32_t pdi_handle; - uint32_t* ipu_inst_buf; - char* pdi_buf; - uint32_t num_instr; - IREE_TRACE(iree_string_view_t function_name;) - IREE_TRACE(iree_string_view_t source_filename;) - IREE_TRACE(uint32_t source_line;) -}; - -iree_status_t iree_hal_xrt_lite_native_executable_create( - const iree_hal_executable_params_t* executable_params, - iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator, - iree_hal_executable_t** out_executable); - -iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_info( - iree_hal_executable_t* executable, int32_t entry_point, - iree_hal_xrt_lite_kernel_info_t* out_info); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt index f387ade2a..c6e5ad98e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt @@ -10,14 +10,13 @@ iree_cc_library( NAME registration HDRS - driver_module.h + "driver_module.h" 
SRCS - driver_module.c + "driver_module.c" DEPS iree::base - iree::base::core_headers - iree-amd-aie::driver::xrt-lite iree::hal + iree-amd-aie::driver::xrt-lite DEFINES "IREE_HAVE_HAL_XRT_LITE_DRIVER_MODULE=1" PUBLIC diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c index f50b2ac68..b0ba1c433 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c @@ -6,49 +6,45 @@ #include "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" -#include -#include - #include "iree-amd-aie/driver/xrt-lite/api.h" #include "iree/base/api.h" -#include "iree/base/status.h" static iree_status_t iree_hal_xrt_lite_driver_factory_enumerate( void* self, iree_host_size_t* out_driver_info_count, const iree_hal_driver_info_t** out_driver_infos) { - IREE_ASSERT_ARGUMENT(out_driver_info_count); - IREE_ASSERT_ARGUMENT(out_driver_infos); - IREE_TRACE_ZONE_BEGIN(z0); - - static const iree_hal_driver_info_t driver_infos[1] = {{ + // TODO(null): return multiple drivers if desired. This information must be + // static. The list here is just what is compiled into the binary and not + // expected to actually try to load or initialize drivers. 
+ static const iree_hal_driver_info_t default_driver_info = { .driver_name = IREE_SVL("xrt-lite"), .full_name = IREE_SVL("XRT-LITE driver (for AIE)"), - }}; - *out_driver_info_count = IREE_ARRAYSIZE(driver_infos); - *out_driver_infos = driver_infos; - - IREE_TRACE_ZONE_END(z0); - + }; + *out_driver_info_count = 1; + *out_driver_infos = &default_driver_info; return iree_ok_status(); } static iree_status_t iree_hal_xrt_lite_driver_factory_try_create( void* self, iree_string_view_t driver_name, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) { - IREE_ASSERT_ARGUMENT(out_driver); - + // TODO(null): use your driver name - this will be the prefix when the user + // specifies the device (`--device=null://foo`). A single driver can support + // multiple prefixes if it wants. if (!iree_string_view_equal(driver_name, IREE_SV("xrt-lite"))) { return iree_make_status(IREE_STATUS_UNAVAILABLE, "no driver '%.*s' is provided by this factory", (int)driver_name.size, driver_name.data); } - IREE_TRACE_ZONE_BEGIN(z0); - - iree_status_t status = - iree_hal_xrt_lite_driver_create(driver_name, host_allocator, out_driver); + // TODO(null): populate options from flags. This driver module file is only + // used in native tools that have access to the flags library. Programmatic + // creation of the driver and devices will bypass this file and pass the + // options via this struct or key-value string parameters. 
+ struct iree_hal_xrt_lite_driver_options_t options; + iree_hal_xrt_lite_driver_options_initialize(&options); - IREE_TRACE_ZONE_END(z0); + iree_status_t status = iree_hal_xrt_lite_driver_create( + driver_name, &options, host_allocator, out_driver); return status; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h index 5b42d7ad3..c8e81405c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h @@ -4,8 +4,8 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ -#define IREE_AMD_AIE_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ +#ifndef IREE_HAL_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ +#define IREE_HAL_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ #include "iree/base/api.h" #include "iree/hal/api.h" @@ -21,4 +21,4 @@ iree_hal_xrt_lite_driver_module_register(iree_hal_driver_registry_t* registry); } // extern "C" #endif // __cplusplus -#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ +#endif // IREE_HAL_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt deleted file mode 100644 index ac1522216..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -if(UNIX) - add_subdirectory(linux) -endif() \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt deleted file mode 100644 index c4e1e5604..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -iree_add_all_subdirs() diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt deleted file mode 100644 index 4cccb9548..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -iree_cc_library( - NAME - kmq - SRCS - amdxdna_accel.h - bo.cpp - bo.h - device.cpp - device.h - fence.cpp - fence.h - hwctx.cpp - hwctx.h - hwq.cpp - hwq.h - pcidev.cpp - pcidev.h - pcidrv.cpp - pcidrv.h - shared.h - shim_debug.cpp - shim_debug.h - PUBLIC -) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h deleted file mode 100644 index d4d18d6b3..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h +++ /dev/null @@ -1,591 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. - */ - -#ifndef AMDXDNA_ACCEL_H_ -#define AMDXDNA_ACCEL_H_ - -#include -#include -#include - -#if defined(__cplusplus) -extern "C" { -#endif - -#define AMDXDNA_DRIVER_MAJOR 1 -#define AMDXDNA_DRIVER_MINOR 0 - -#define AMDXDNA_INVALID_ADDR (~0UL) -#define AMDXDNA_INVALID_CTX_HANDLE 0 -#define AMDXDNA_INVALID_BO_HANDLE 0 -#define AMDXDNA_INVALID_FENCE_HANDLE 0 - -/* - * The interface can grow/extend over time. - * On each struct amdxdna_drm_*, to support potential extension, we defined it - * like this. - * - * Example code: - * - * struct amdxdna_drm_example_data { - * .ext = (uintptr_t)&example_data_ext; - * ... - * }; - * - * We don't have extension now. The extension struct will define in the future. - */ - -enum amdxdna_drm_ioctl_id { - DRM_AMDXDNA_CREATE_HWCTX, - DRM_AMDXDNA_DESTROY_HWCTX, - DRM_AMDXDNA_CONFIG_HWCTX, - DRM_AMDXDNA_CREATE_BO, - DRM_AMDXDNA_GET_BO_INFO, - DRM_AMDXDNA_SYNC_BO, - DRM_AMDXDNA_EXEC_CMD, - DRM_AMDXDNA_WAIT_CMD, - DRM_AMDXDNA_GET_INFO, - DRM_AMDXDNA_SET_STATE, - DRM_AMDXDNA_NUM_IOCTLS -}; - -enum amdxdna_device_type { - AMDXDNA_DEV_TYPE_UNKNOWN = -1, - AMDXDNA_DEV_TYPE_KMQ, - AMDXDNA_DEV_TYPE_UMQ, -}; - -/** - * struct qos_info - QoS information for driver. 
- * @gops: Giga operations per second. - * @fps: Frames per second. - * @dma_bandwidth: DMA bandwidtha. - * @latency: Frame response latency. - * @frame_exec_time: Frame execution time. - * @priority: Request priority. - * - * User program can provide QoS hints to driver. - */ -struct amdxdna_qos_info { - __u32 gops; - __u32 fps; - __u32 dma_bandwidth; - __u32 latency; - __u32 frame_exec_time; - __u32 priority; -}; - -/** - * struct amdxdna_drm_create_hwctx - Create hardware context. - * @ext: MBZ. - * @ext_flags: MBZ. - * @qos_p: Address of QoS info. - * @umq_bo: BO handle for user mode queue(UMQ). - * @log_buf_bo: BO handle for log buffer. - * @max_opc: Maximum operations per cycle. - * @num_tiles: Number of AIE tiles. - * @mem_size: Size of AIE tile memory. - * @umq_doorbell: Returned offset of doorbell associated with UMQ. - * @handle: Returned hardware context handle. - */ -struct amdxdna_drm_create_hwctx { - __u64 ext; - __u64 ext_flags; - __u64 qos_p; - __u32 umq_bo; - __u32 log_buf_bo; - __u32 max_opc; - __u32 num_tiles; - __u32 mem_size; - __u32 umq_doorbell; - __u32 handle; -}; - -/** - * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. - * @handle: Hardware context handle. - * @pad: MBZ. 
- */ -struct amdxdna_drm_destroy_hwctx { - __u32 handle; - __u32 pad; -}; - -/** - * struct amdxdna_cu_config - configuration for one CU - * @cu_bo: CU configuration buffer bo handle - * @cu_func: Functional of a CU - * @pad: MBZ - */ -struct amdxdna_cu_config { - __u32 cu_bo; - __u8 cu_func; - __u8 pad[3]; -}; - -/** - * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware - * context - * @num_cus: Number of CUs to configure - * @pad: MBZ - * @cu_configs: Array of CU configurations of struct amdxdna_cu_config - */ -// struct amdxdna_hwctx_param_config_cu { -// __u16 num_cus; -// __u16 pad[3]; -// struct amdxdna_cu_config cu_configs[] __counted_by(num_cus); -// }; - -enum amdxdna_drm_config_hwctx_param { - DRM_AMDXDNA_HWCTX_CONFIG_CU, - DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, - DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, - DRM_AMDXDNA_HWCTX_CONFIG_NUM -}; - -/** - * struct amdxdna_drm_config_hwctx - Configure hardware context. - * @handle: hardware context handle. - * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the - * structure passed in via param_val. - * @param_val: A structure specified by the param_type struct member. - * @param_val_size: Size of the parameter buffer pointed to by the param_val. - * If param_val is not a pointer, driver can ignore this. - * - * Note: if the param_val is a pointer pointing to a buffer, the maximum size - * of the buffer is 4KiB(PAGE_SIZE). 
- */ -struct amdxdna_drm_config_hwctx { - __u32 handle; - __u32 param_type; - __u64 param_val; - __u32 param_val_size; - __u32 pad; -}; - -/* - * AMDXDNA_BO_SHMEM: DRM GEM SHMEM bo - * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory - * AMDXDNA_BO_DEV_BO: Allocated from BO_DEV_HEAP - * AMDXDNA_BO_CMD: User and driver accessible bo - * AMDXDNA_BO_DMA: DRM GEM DMA bo - */ -enum amdxdna_bo_type { - AMDXDNA_BO_INVALID = 0, - AMDXDNA_BO_SHMEM, - AMDXDNA_BO_DEV_HEAP, - AMDXDNA_BO_DEV, - AMDXDNA_BO_CMD, - AMDXDNA_BO_DMA, -}; - -/** - * struct amdxdna_drm_create_bo - Create a buffer object. - * @flags: Buffer flags. MBZ. - * @type: Buffer type. - * @vaddr: User VA of buffer if applied. MBZ. - * @size: Size in bytes. - * @handle: Returned DRM buffer object handle. - */ -struct amdxdna_drm_create_bo { - __u64 flags; - __u32 type; - __u32 _pad; - __u64 vaddr; - __u64 size; - __u32 handle; -}; - -/** - * struct amdxdna_drm_get_bo_info - Get buffer object information. - * @ext: MBZ. - * @ext_flags: MBZ. - * @handle: DRM buffer object handle. - * @map_offset: Returned DRM fake offset for mmap(). - * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). - * @xdna_addr: Returned XDNA device virtual address. - */ -struct amdxdna_drm_get_bo_info { - __u64 ext; - __u64 ext_flags; - __u32 handle; - __u32 _pad; - __u64 map_offset; - __u64 vaddr; - __u64 xdna_addr; -}; - -/** - * struct amdxdna_drm_sync_bo - Sync buffer object. - * @handle: Buffer object handle. - * @direction: Direction of sync, can be from device or to device. - * @offset: Offset in the buffer to sync. - * @size: Size in bytes. - */ -struct amdxdna_drm_sync_bo { - __u32 handle; -#define SYNC_DIRECT_TO_DEVICE 0U -#define SYNC_DIRECT_FROM_DEVICE 1U - __u32 direction; - __u64 offset; - __u64 size; -}; - -enum amdxdna_cmd_type { - AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0, - AMDXDNA_CMD_SUBMIT_DEPENDENCY, - AMDXDNA_CMD_SUBMIT_SIGNAL, -}; - -/** - * struct amdxdna_drm_exec_cmd - Execute command. 
- * @ext: MBZ. - * @ext_flags: MBZ. - * @hwctx: Hardware context handle. - * @type: One of command type in enum amdxdna_cmd_type. - * @cmd_handles: Array of command handles or the command handle itself in case - * of just one. - * @args: Array of arguments for all command handles. - * @cmd_count: Number of command handles in the cmd_handles array. - * @arg_count: Number of arguments in the args array. - * @seq: Returned sequence number for this command. - */ -struct amdxdna_drm_exec_cmd { - __u64 ext; - __u64 ext_flags; - __u32 hwctx; - __u32 type; - __u64 cmd_handles; - __u64 args; - __u32 cmd_count; - __u32 arg_count; - __u64 seq; -}; - -/** - * struct amdxdna_drm_wait_cmd - Wait exectuion command. - * - * @hwctx: hardware context handle. - * @timeout: timeout in ms, 0 implies infinite wait. - * @seq: sequence number of the command returned by execute command. - * - * Wait a command specified by seq to be completed. - */ -struct amdxdna_drm_wait_cmd { - __u32 hwctx; - __u32 timeout; - __u64 seq; -}; - -/** - * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware - * @buffer: The user space buffer that will return the AIE status - * @buffer_size: The size of the user space buffer - * @cols_filled: A bitmap of AIE columns whose data has been returned in the - * buffer. - */ -struct amdxdna_drm_query_aie_status { - __u64 buffer; /* out */ - __u32 buffer_size; /* in */ - __u32 cols_filled; /* out */ -}; - -/** - * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware - * @major: The major version number - * @minor: The minor version number - */ -struct amdxdna_drm_query_aie_version { - __u32 major; /* out */ - __u32 minor; /* out */ -}; - -/** - * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile - * (core, mem, shim) - * @row_count: The number of rows. - * @row_start: The starting row number. - * @dma_channel_count: The number of dma channels. - * @lock_count: The number of locks. 
- * @event_reg_count: The number of events. - * @pad: MBZ. - */ -struct amdxdna_drm_query_aie_tile_metadata { - __u16 row_count; - __u16 row_start; - __u16 dma_channel_count; - __u16 lock_count; - __u16 event_reg_count; - __u16 pad[3]; -}; - -/** - * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE - * hardware - * @col_size: The size of a column in bytes. - * @cols: The total number of columns. - * @rows: The total number of rows. - * @version: The version of the AIE hardware. - * @core: The metadata for all core tiles. - * @mem: The metadata for all mem tiles. - * @shim: The metadata for all shim tiles. - */ -struct amdxdna_drm_query_aie_metadata { - __u32 col_size; - __u16 cols; - __u16 rows; - struct amdxdna_drm_query_aie_version version; - struct amdxdna_drm_query_aie_tile_metadata core; - struct amdxdna_drm_query_aie_tile_metadata mem; - struct amdxdna_drm_query_aie_tile_metadata shim; -}; - -/** - * struct amdxdna_drm_query_clock - Metadata for a clock - * @name: The clock name. - * @freq_mhz: The clock frequency. - * @pad: MBZ. - */ -struct amdxdna_drm_query_clock { - __u8 name[16]; - __u32 freq_mhz; - __u32 pad; -}; - -/** - * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks - * @mp_npu_clock: The metadata for MP-NPU clock. - * @h_clock: The metadata for H clock. - */ -struct amdxdna_drm_query_clock_metadata { - struct amdxdna_drm_query_clock mp_npu_clock; - struct amdxdna_drm_query_clock h_clock; -}; - -enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; - -/** - * struct amdxdna_drm_query_sensor - The data for single sensor. - * @label: The name for a sensor. - * @input: The current value of the sensor. - * @max: The maximum value possible for the sensor. - * @average: The average value of the sensor. - * @highest: The highest recorded sensor value for this driver load for the - * sensor. - * @status: The sensor status. - * @units: The sensor units. 
- * @unitm: Translates value member variables into the correct unit via (pow(10, - * unitm) * value) - * @type: The sensor type from enum amdxdna_sensor_type - * @pad: MBZ. - */ -struct amdxdna_drm_query_sensor { - __u8 label[64]; - __u32 input; - __u32 max; - __u32 average; - __u32 highest; - __u8 status[64]; - __u8 units[16]; - __s8 unitm; - __u8 type; - __u8 pad[6]; -}; - -/** - * struct amdxdna_drm_query_hwctx - The data for single context. - * @context_id: The ID for this context. - * @start_col: The starting column for the partition assigned to this context. - * @num_col: The number of columns in the partition assigned to this context. - * @pid: The Process ID of the process that created this context. - * @command_submissions: The number of commands submitted to this context. - * @command_completions: The number of commands completed by this context. - * @migrations: The number of times this context has been moved to a different - * partition. - * @preemptions: The number of times this context has been preempted by another - * context in the same partition. - * @pad: MBZ. - */ -struct amdxdna_drm_query_hwctx { - __u32 context_id; - __u32 start_col; - __u32 num_col; - __u32 pad; - __s64 pid; - __u64 command_submissions; - __u64 command_completions; - __u64 migrations; - __u64 preemptions; - __u64 errors; -}; - -/** - * struct amdxdna_drm_aie_mem - The data for AIE memory read/write - * @col: The AIE column index - * @row: The AIE row index - * @addr: The AIE memory address to read/write - * @size: The size of bytes to read/write - * @buf_p: The buffer to store read/write data - * - * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM - * parameters. 
- */ -struct amdxdna_drm_aie_mem { - __u32 col; - __u32 row; - __u32 addr; - __u32 size; - __u64 buf_p; -}; - -/** - * struct amdxdna_drm_aie_reg - The data for AIE register read/write - * @col: The AIE column index - * @row: The AIE row index - * @addr: The AIE register address to read/write - * @val: The value to write or returned value from AIE - * - * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG - * parameters. - */ -struct amdxdna_drm_aie_reg { - __u32 col; - __u32 row; - __u32 addr; - __u32 val; -}; - -enum amdxdna_power_mode_type { - POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */ - POWER_MODE_LOW, /**< Set frequency to lowest DPM */ - POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ - POWER_MODE_HIGH, /**< Set frequency to highest DPM */ - POWER_MODE_TURBO, /**< More power, more performance */ -}; - -/** - * struct amdxdna_drm_get_power_mode - Get the power mode of the AIE hardware - * @power_mode: The sensor type from enum amdxdna_power_mode_type - * @pad: MBZ. - */ -struct amdxdna_drm_get_power_mode { - __u8 power_mode; - __u8 pad[7]; -}; - -/** - * struct amdxdna_drm_query_firmware_version - Query the version of the firmware - * @major: The major version number - * @minor: The minor version number - * @patch: The patch level version number - * @build: The build ID - */ -struct amdxdna_drm_query_firmware_version { - __u32 major; /* out */ - __u32 minor; /* out */ - __u32 patch; /* out */ - __u32 build; /* out */ -}; - -enum amdxdna_drm_get_param { - DRM_AMDXDNA_QUERY_AIE_STATUS, - DRM_AMDXDNA_QUERY_AIE_METADATA, - DRM_AMDXDNA_QUERY_AIE_VERSION, - DRM_AMDXDNA_QUERY_CLOCK_METADATA, - DRM_AMDXDNA_QUERY_SENSORS, - DRM_AMDXDNA_QUERY_HW_CONTEXTS, - DRM_AMDXDNA_READ_AIE_MEM, - DRM_AMDXDNA_READ_AIE_REG, - DRM_AMDXDNA_QUERY_FIRMWARE_VERSION, - DRM_AMDXDNA_GET_POWER_MODE, - DRM_AMDXDNA_QUERY_TELEMETRY, - DRM_AMDXDNA_NUM_GET_PARAM, -}; - -/** - * struct amdxdna_drm_get_info - Get some information from the AIE hardware. 
- * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed - * in the buffer. - * @buffer_size: Size of the input buffer. Size needed/written by the kernel. - * @buffer: A structure specified by the param struct member. - */ -struct amdxdna_drm_get_info { - __u32 param; /* in */ - __u32 buffer_size; /* in/out */ - __u64 buffer; /* in/out */ -}; - -/** - * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware - * @power_mode: The sensor type from enum amdxdna_power_mode_type - * @pad: MBZ. - */ -struct amdxdna_drm_set_power_mode { - __u8 power_mode; - __u8 pad[7]; -}; - -enum amdxdna_drm_set_param { - DRM_AMDXDNA_SET_POWER_MODE, - DRM_AMDXDNA_WRITE_AIE_MEM, - DRM_AMDXDNA_WRITE_AIE_REG, - DRM_AMDXDNA_NUM_SET_PARAM, -}; - -/** - * struct amdxdna_drm_set_state - Set the state of some component within the AIE - * hardware. - * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed - * in the buffer. - * @buffer_size: Size of the input buffer. - * @buffer: A structure specified by the param struct member. 
- */ -struct amdxdna_drm_set_state { - __u32 param; /* in */ - __u32 buffer_size; /* in */ - __u64 buffer; /* in */ -}; - -#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ - struct amdxdna_drm_create_hwctx) - -#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ - struct amdxdna_drm_destroy_hwctx) - -#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ - struct amdxdna_drm_config_hwctx) - -#define DRM_IOCTL_AMDXDNA_CREATE_BO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ - struct amdxdna_drm_create_bo) - -#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ - struct amdxdna_drm_get_bo_info) - -#define DRM_IOCTL_AMDXDNA_SYNC_BO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) - -#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) - -#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) - -#define DRM_IOCTL_AMDXDNA_GET_INFO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) - -#define DRM_IOCTL_AMDXDNA_SET_STATE \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ - struct amdxdna_drm_set_state) - -#if defined(__cplusplus) -} /* extern c end */ -#endif - -#endif /* AMDXDNA_ACCEL_H_ */ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp deleted file mode 100644 index 739456ccf..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ /dev/null @@ -1,444 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "bo.h" - -#include -#include -#include -#include -#include - -#include "pcidev.h" -#include "shim_debug.h" - -namespace { - -uint32_t alloc_drm_bo(const shim_xdna::pdev& dev, amdxdna_bo_type type, - void* buf, size_t size) { - amdxdna_drm_create_bo cbo = { - .type = static_cast(type), - .vaddr = reinterpret_cast(buf), - .size = size, - }; - dev.ioctl(DRM_IOCTL_AMDXDNA_CREATE_BO, &cbo); - return cbo.handle; -} - -void free_drm_bo(const shim_xdna::pdev& dev, uint32_t boh) { - drm_gem_close close_bo = {boh, 0}; - dev.ioctl(DRM_IOCTL_GEM_CLOSE, &close_bo); -} - -void get_drm_bo_info(const shim_xdna::pdev& dev, uint32_t boh, - amdxdna_drm_get_bo_info* bo_info) { - bo_info->handle = boh; - dev.ioctl(DRM_IOCTL_AMDXDNA_GET_BO_INFO, bo_info); -} - -void* map_parent_range(size_t size) { - auto p = ::mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, - -1, 0); - if (!p) shim_xdna::shim_err(errno, "mmap(len=%ld) failed", size); - - return p; -} - -void* map_drm_bo(const shim_xdna::pdev& dev, size_t size, int prot, - uint64_t offset) { - return dev.mmap(0, size, prot, MAP_SHARED | MAP_LOCKED, offset); -} - -void* map_drm_bo(const shim_xdna::pdev& dev, void* addr, size_t size, int prot, - int flags, uint64_t offset) { - return dev.mmap(addr, size, prot, flags, offset); -} - -void unmap_drm_bo(const shim_xdna::pdev& dev, void* addr, size_t size) { - dev.munmap(addr, size); -} - -void attach_dbg_drm_bo(const shim_xdna::pdev& dev, uint32_t boh, - uint32_t ctx_id) { - amdxdna_drm_config_hwctx adbo = { - .handle = ctx_id, - .param_type = DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, - .param_val = boh, - }; - dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); -} - -void detach_dbg_drm_bo(const shim_xdna::pdev& dev, uint32_t boh, - uint32_t ctx_id) { - amdxdna_drm_config_hwctx adbo = { - .handle = ctx_id, - .param_type = DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, - .param_val = boh, - }; - dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); -} - -int export_drm_bo(const 
shim_xdna::pdev& dev, uint32_t boh) { - drm_prime_handle exp_bo = {boh, DRM_RDWR | DRM_CLOEXEC, -1}; - dev.ioctl(DRM_IOCTL_PRIME_HANDLE_TO_FD, &exp_bo); - return exp_bo.fd; -} - -uint32_t import_drm_bo(const shim_xdna::pdev& dev, - const shim_xdna::shared_handle& share, - amdxdna_bo_type* type, size_t* size) { - shim_xdna::shared_handle::export_handle fd = share.get_export_handle(); - drm_prime_handle imp_bo = {AMDXDNA_INVALID_BO_HANDLE, 0, fd}; - dev.ioctl(DRM_IOCTL_PRIME_FD_TO_HANDLE, &imp_bo); - - *type = AMDXDNA_BO_SHMEM; - *size = lseek(fd, 0, SEEK_END); - lseek(fd, 0, SEEK_SET); - - return imp_bo.handle; -} - -bool is_power_of_two(size_t x) { return (x > 0) && ((x & (x - 1)) == 0); } - -void* addr_align(void* p, size_t align) { - if (!is_power_of_two(align)) - shim_xdna::shim_err(EINVAL, "Alignment 0x%lx is not power of two", align); - - return (void*)(((uintptr_t)p + align) & ~(align - 1)); -} - -amdxdna_bo_type flag_to_type(uint64_t bo_flags) { - auto flags = shim_xdna::xcl_bo_flags{bo_flags}; - auto boflags = (static_cast(flags.boflags) << 24); - switch (boflags) { - case XCL_BO_FLAGS_NONE: - case XCL_BO_FLAGS_HOST_ONLY: - return AMDXDNA_BO_SHMEM; - case XCL_BO_FLAGS_CACHEABLE: - return AMDXDNA_BO_DEV; - case XCL_BO_FLAGS_EXECBUF: - return AMDXDNA_BO_CMD; - default: - break; - } - return AMDXDNA_BO_INVALID; -} - -// flash cache line for non coherence memory -inline void clflush_data(const void* base, size_t offset, size_t len) { - static long cacheline_size = 0; - - if (!cacheline_size) { - long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - if (sz <= 0) - shim_xdna::shim_err(EINVAL, "Invalid cache line size: %ld", sz); - cacheline_size = sz; - } - - const char* cur = (const char*)base; - cur += offset; - uintptr_t lastline = (uintptr_t)(cur + len - 1) | (cacheline_size - 1); - do { - _mm_clflush(cur); - cur += cacheline_size; - } while (cur <= (const char*)lastline); -} - -void sync_drm_bo(const shim_xdna::pdev& dev, uint32_t boh, - shim_xdna::bo::direction 
dir, size_t offset, size_t len) { - amdxdna_drm_sync_bo sbo = { - .handle = boh, - .direction = (dir == shim_xdna::bo::direction::host2device - ? SYNC_DIRECT_TO_DEVICE - : SYNC_DIRECT_FROM_DEVICE), - .offset = offset, - .size = len, - }; - dev.ioctl(DRM_IOCTL_AMDXDNA_SYNC_BO, &sbo); -} - -bool is_driver_sync() { - static int drv_sync = -1; - - if (drv_sync == -1) { - bool ds = std::getenv("Debug.force_driver_sync"); - drv_sync = ds ? 1 : 0; - } - return drv_sync == 1; -} - -} // namespace - -namespace shim_xdna { - -bo::drm_bo::drm_bo(bo& parent, const amdxdna_drm_get_bo_info& bo_info) - : m_parent(parent), - m_handle(bo_info.handle), - m_map_offset(bo_info.map_offset), - m_vaddr(bo_info.vaddr), - m_xdna_addr(bo_info.xdna_addr) {} - -bo::drm_bo::~drm_bo() { - if (m_handle == AMDXDNA_INVALID_BO_HANDLE) return; - free_drm_bo(m_parent.m_pdev, m_handle); -} - -std::string bo::type_to_name() const { - switch (m_type) { - case AMDXDNA_BO_SHMEM: - return std::string("AMDXDNA_BO_SHMEM"); - case AMDXDNA_BO_DEV_HEAP: - return std::string("AMDXDNA_BO_DEV_HEAP"); - case AMDXDNA_BO_DEV: - if (xcl_bo_flags{m_flags}.use == XRT_BO_USE_DEBUG) - return std::string("AMDXDNA_BO_DEV_DEBUG"); - return std::string("AMDXDNA_BO_DEV"); - case AMDXDNA_BO_CMD: - return std::string("AMDXDNA_BO_CMD"); - } - return std::string("BO_UNKNOWN"); -} - -std::string bo::describe() const { - std::string desc = "type="; - desc += type_to_name(); - desc += ", "; - desc += "drm_bo="; - desc += std::to_string(m_bo->m_handle); - desc += ", "; - desc += "size="; - desc += std::to_string(m_aligned_size); - return desc; -} - -void bo::mmap_bo(size_t align) { - size_t a = align; - - if (m_bo->m_map_offset == AMDXDNA_INVALID_ADDR) { - m_aligned = reinterpret_cast(m_bo->m_vaddr); - return; - } - - if (a == 0) { - m_aligned = map_drm_bo(m_pdev, m_aligned_size, PROT_READ | PROT_WRITE, - m_bo->m_map_offset); - return; - } - - /* - * Handle special alignment - * The first mmap() is just for reserved a range in user 
vritual address - * space. The second mmap() uses an aligned addr as the first argument in mmap - * syscall. - */ - m_parent_size = align * 2 - 1; - m_parent = map_parent_range(m_parent_size); - auto aligned = addr_align(m_parent, align); - m_aligned = - map_drm_bo(m_pdev, aligned, m_aligned_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, m_bo->m_map_offset); -} - -void bo::munmap_bo() { - shim_debug("Unmap BO, aligned %p parent %p", m_aligned, m_parent); - if (m_bo->m_map_offset == AMDXDNA_INVALID_ADDR) return; - - unmap_drm_bo(m_pdev, m_aligned, m_aligned_size); - if (m_parent) unmap_drm_bo(m_pdev, m_parent, m_parent_size); -} - -void bo::alloc_bo() { - uint32_t boh = alloc_drm_bo(m_pdev, m_type, NULL, m_aligned_size); - - amdxdna_drm_get_bo_info bo_info = {}; - get_drm_bo_info(m_pdev, boh, &bo_info); - m_bo = std::make_unique(*this, bo_info); -} - -void bo::import_bo() { - uint32_t boh = import_drm_bo(m_pdev, m_import, &m_type, &m_aligned_size); - - amdxdna_drm_get_bo_info bo_info = {}; - get_drm_bo_info(m_pdev, boh, &bo_info); - m_bo = std::make_unique(*this, bo_info); -} - -void bo::free_bo() { m_bo.reset(); } - -bo::properties bo::get_properties() const { - return {m_flags, m_aligned_size, get_paddr(), get_drm_bo_handle()}; -} - -void* bo::map(bo::map_type type) { - if (type != bo::map_type::write) - shim_err( - EINVAL, - "Not support map BO as readonly. 
Type must be bo::map_type::write"); - return m_aligned; -} - -void bo::unmap(void* addr) {} - -uint64_t bo::get_paddr() const { - if (m_bo->m_xdna_addr != AMDXDNA_INVALID_ADDR) return m_bo->m_xdna_addr; - return reinterpret_cast(m_aligned); -} - -void bo::set_cmd_id(uint64_t id) { m_cmd_id = id; } - -uint64_t bo::get_cmd_id() const { return m_cmd_id; } - -uint32_t bo::get_drm_bo_handle() const { return m_bo->m_handle; } - -void bo::attach_to_ctx() { - if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; - - auto boh = get_drm_bo_handle(); - shim_debug("Attaching drm_bo %d to ctx: %d", boh, m_owner_ctx_id); - attach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); -} - -void bo::detach_from_ctx() { - if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; - - auto boh = get_drm_bo_handle(); - shim_debug("Detaching drm_bo %d from ctx: %d", boh, m_owner_ctx_id); - detach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); -} - -std::unique_ptr bo::share() const { - auto boh = get_drm_bo_handle(); - auto fd = export_drm_bo(m_pdev, boh); - shim_debug("Exported bo %d to fd %d", boh, fd); - return std::make_unique(fd); -} - -amdxdna_bo_type bo::get_type() const { return m_type; } - -bo::bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags) - : bo(device, ctx_id, size, flags, flag_to_type(flags)) { - if (m_type == AMDXDNA_BO_INVALID) - shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); -} - -bo::bo(const device& device, size_t size, amdxdna_bo_type type) - : bo(device, AMDXDNA_INVALID_CTX_HANDLE, size, 0, type) {} - -bo::bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags, - amdxdna_bo_type type) - : m_pdev(device.get_pdev()), - m_aligned_size(size), - m_flags(flags), - m_type(type), - m_import(-1), - m_owner_ctx_id(ctx_id) { - size_t align = 0; - - if (m_type == AMDXDNA_BO_DEV_HEAP) - align = 64 * 1024 * 1024; // Device mem heap must align at 64MB boundary. - - alloc_bo(); - mmap_bo(align); - - // Newly allocated buffer may contain dirty pages. 
If used as output buffer, - // the data in cacheline will be flushed onto memory and pollute the output - // from device. We perform a cache flush right after the BO is allocated to - // avoid this issue. - if (m_type == AMDXDNA_BO_SHMEM) sync(direction::host2device, size, 0); - - attach_to_ctx(); - - shim_debug( - "Allocated KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, type=%d, " - "drm_bo=%d)", - m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); -} - -bo::bo(const device& device, shared_handle::export_handle ehdl) - : m_pdev(device.get_pdev()), m_import(ehdl) { - import_bo(); - mmap_bo(); - shim_debug( - "Imported KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, type=%d, " - "drm_bo=%d)", - m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); -} - -bo::~bo() { - shim_debug("Freeing KMQ BO, %s", describe().c_str()); - - munmap_bo(); - detach_from_ctx(); - // If BO is in use, we should block and wait in driver - free_bo(); -} - -void bo::sync(direction dir, size_t size, size_t offset) { - if (is_driver_sync()) { - sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); - return; - } - - if (offset + size > m_aligned_size) - shim_err(EINVAL, "Invalid BO offset and size for sync'ing: %ld, %ld", - offset, size); - - switch (m_type) { - case AMDXDNA_BO_SHMEM: - case AMDXDNA_BO_CMD: - clflush_data(m_aligned, offset, size); - break; - case AMDXDNA_BO_DEV: - if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) - clflush_data(m_aligned, offset, size); - else - sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); - break; - default: - shim_err(ENOTSUP, "Can't sync bo type %d", m_type); - } -} - -void bo::bind_at(size_t pos, const bo* bh, size_t offset, size_t size) { - auto boh = reinterpret_cast(bh); - std::lock_guard lg(m_args_map_lock); - - if (m_type != AMDXDNA_BO_CMD) - shim_err(EINVAL, "Can't call bind_at() on non-cmd BO"); - - if (!pos) m_args_map.clear(); - - if (boh->get_type() != AMDXDNA_BO_CMD) { - auto h = 
boh->get_drm_bo_handle(); - m_args_map[pos] = h; - shim_debug("Added arg BO %d to cmd BO %d", h, get_drm_bo_handle()); - } else { - const size_t max_args_order = 6; - const size_t max_args = 1 << max_args_order; - size_t key = pos << max_args_order; - uint32_t hs[max_args]; - auto arg_cnt = boh->get_arg_bo_handles(hs, max_args); - std::string bohs; - for (int i = 0; i < arg_cnt; i++) { - m_args_map[key + i] = hs[i]; - bohs += std::to_string(hs[i]) + " "; - } - shim_debug("Added arg BO %s to cmd BO %d", bohs.c_str(), - get_drm_bo_handle()); - } -} - -uint32_t bo::get_arg_bo_handles(uint32_t* handles, size_t num) const { - std::lock_guard lg(m_args_map_lock); - - auto sz = m_args_map.size(); - if (sz > num) - shim_err(E2BIG, "There are %ld BO args, provided buffer can hold only %ld", - sz, num); - - for (auto m : m_args_map) *(handles++) = m.second; - - return sz; -} - -} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h deleted file mode 100644 index ea163db45..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#ifndef _BO_XDNA_H_ -#define _BO_XDNA_H_ - -#include -#include -#include -#include - -#include "amdxdna_accel.h" -#include "shared.h" -#include "shim_debug.h" - -namespace shim_xdna { - -#define XRT_BO_USE_NORMAL 0 -#define XRT_BO_USE_DEBUG 1 - -/** - * XCL BO Flags bits layout - * - * bits 0 ~ 15: DDR BANK index - * bits 24 ~ 31: BO flags - */ -#define XRT_BO_FLAGS_MEMIDX_MASK (0xFFFFFFUL) -#define XCL_BO_FLAGS_NONE (0) -#define XCL_BO_FLAGS_CACHEABLE (1U << 24) -#define XCL_BO_FLAGS_KERNBUF (1U << 25) -#define XCL_BO_FLAGS_SGL (1U << 26) -#define XCL_BO_FLAGS_SVM (1U << 27) -#define XCL_BO_FLAGS_DEV_ONLY (1U << 28) -#define XCL_BO_FLAGS_HOST_ONLY (1U << 29) -#define XCL_BO_FLAGS_P2P (1U << 30) -#define XCL_BO_FLAGS_EXECBUF (1U << 31) - -/** - * Encoding of flags passed to xcl buffer allocation APIs - */ -struct xcl_bo_flags { - union { - uint64_t all; // [63-0] - - struct { - uint32_t flags; // [31-0] - uint32_t extension; // [63-32] - }; - - struct { - uint16_t bank; // [15-0] - uint8_t slot; // [23-16] - uint8_t boflags; // [31-24] - - // extension - uint32_t access : 2; // [33-32] - uint32_t dir : 2; // [35-34] - uint32_t use : 1; // [36] - uint32_t unused : 27; // [63-35] - }; - }; -}; - -struct device; -struct pdev; - -struct bo { - // map_type - determines how a buffer is mapped - enum class map_type { read, write }; - - enum xclBOSyncDirection { - XCL_BO_SYNC_BO_TO_DEVICE = 0, - XCL_BO_SYNC_BO_FROM_DEVICE, - XCL_BO_SYNC_BO_GMIO_TO_AIE, - XCL_BO_SYNC_BO_AIE_TO_GMIO, - }; - - // direction - direction of sync operation - enum class direction { - host2device = XCL_BO_SYNC_BO_TO_DEVICE, - device2host = XCL_BO_SYNC_BO_FROM_DEVICE, - }; - - // properties - buffer details - struct properties { - uint64_t flags; // flags of bo - uint64_t size; // size of bo - uint64_t paddr; // physical address - uint64_t kmhdl; // kernel mode handle - }; - - using uint32_t = uint32_t; - bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags, - amdxdna_bo_type 
type); - - bo(const device& device, uint32_t ctx_id, size_t size, uint64_t flags); - - bo(const device& device, shared_handle::export_handle ehdl); - - ~bo(); - - void* map(map_type); - - void unmap(void* addr); - - void sync(direction, size_t size, size_t offset); - - properties get_properties() const; - - std::unique_ptr share() const; - - void copy(const bo* src, size_t size, size_t dst_offset, size_t src_offset) { - shim_not_supported_err(__func__); - } - - // For cmd BO only - void set_cmd_id(uint64_t id); - // For cmd BO only - uint64_t get_cmd_id() const; - - uint32_t get_drm_bo_handle() const; - - amdxdna_bo_type get_type() const; - - // DRM BO managed by driver. - struct drm_bo { - public: - bo& m_parent; - uint32_t m_handle = AMDXDNA_INVALID_BO_HANDLE; - off_t m_map_offset = AMDXDNA_INVALID_ADDR; - uint64_t m_xdna_addr = AMDXDNA_INVALID_ADDR; - uint64_t m_vaddr = AMDXDNA_INVALID_ADDR; - - drm_bo(bo& parent, const amdxdna_drm_get_bo_info& bo_info); - ~drm_bo(); - }; - - std::string describe() const; - - // Alloc DRM BO from driver - void alloc_bo(); - - // Import DRM BO from m_import shared object - void import_bo(); - - // Free DRM BO in driver - void free_bo(); - - void mmap_bo(size_t align = 0); - - void munmap_bo(); - - uint64_t get_paddr() const; - - std::string type_to_name() const; - - void attach_to_ctx(); - - void detach_from_ctx(); - - const pdev& m_pdev; - void* m_parent = nullptr; - void* m_aligned = nullptr; - size_t m_parent_size = 0; - size_t m_aligned_size = 0; - uint64_t m_flags = 0; - amdxdna_bo_type m_type = AMDXDNA_BO_INVALID; - std::unique_ptr m_bo; - const shared_handle m_import; - - // Command ID in the queue after command submission. - // Only valid for cmd BO. - uint64_t m_cmd_id = -1; - - // Used when exclusively assigned to a HW context. By default, BO is shared - // among all HW contexts. 
- uint32_t m_owner_ctx_id = AMDXDNA_INVALID_CTX_HANDLE; - - void bind_at(size_t pos, const bo* bh, size_t offset, size_t size); - - // Support BO creation from internal - bo(const device& device, size_t size, amdxdna_bo_type type); - - // Obtain array of arg BO handles, returns real number of handles - uint32_t get_arg_bo_handles(uint32_t* handles, size_t num) const; - - // Only for AMDXDNA_BO_CMD type - std::map m_args_map; - mutable std::mutex m_args_map_lock; -}; - -} // namespace shim_xdna - -#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp deleted file mode 100644 index 69676385c..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. - All rights reserved - -#include "device.h" - -#include - -#include "bo.h" -#include "hwctx.h" -#include "pcidev.h" - -namespace shim_xdna { - -device::device(const pdev& pdev, void* shim_handle) - : m_pdev(pdev), m_handle(shim_handle) { - shim_debug("Created KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str()); -} - -device::~device() { - shim_debug("Destroying KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str()); - m_pdev.close(); -} - -// std::unique_ptr device::create_hw_context( -// const device& dev, const hw_ctx::qos_type& qos) const { -// return std::make_unique(dev, qos); -// } - -std::unique_ptr device::alloc_bo(void* userptr, uint32_t ctx_id, - size_t size, uint64_t flags) { - if (userptr) shim_not_supported_err("User ptr BO"); - - auto b = bo(*this, ctx_id, size, flags); - return std::make_unique(*this, ctx_id, size, flags); -} - -std::unique_ptr device::import_bo(shared_handle::export_handle ehdl) const { - return std::make_unique(*this, ehdl); -} - -std::vector device::read_aie_mem(uint16_t col, uint16_t row, - uint32_t offset, uint32_t size) { - 
amdxdna_drm_aie_mem mem; - std::vector store_buf(size); - - mem.col = col; - mem.row = row; - mem.addr = offset; - mem.size = size; - mem.buf_p = reinterpret_cast(store_buf.data()); - - amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_MEM, - .buffer_size = sizeof(mem), - .buffer = reinterpret_cast(&mem)}; - - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg); - - return store_buf; -} - -uint32_t device::read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr) { - amdxdna_drm_aie_reg reg; - - reg.col = col; - reg.row = row; - reg.addr = reg_addr; - reg.val = 0; - - amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_REG, - .buffer_size = sizeof(reg), - .buffer = reinterpret_cast(®)}; - - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg); - - return reg.val; -} - -size_t device::write_aie_mem(uint16_t col, uint16_t row, uint32_t offset, - const std::vector& buf) { - amdxdna_drm_aie_mem mem; - uint32_t size = static_cast(buf.size()); - - mem.col = col; - mem.row = row; - mem.addr = offset; - mem.size = size; - mem.buf_p = reinterpret_cast(buf.data()); - - amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_MEM, - .buffer_size = sizeof(mem), - .buffer = reinterpret_cast(&mem)}; - - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg); - - return size; -} - -bool device::write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr, - uint32_t reg_val) { - amdxdna_drm_aie_reg reg; - - reg.col = col; - reg.row = row; - reg.addr = reg_addr; - reg.val = reg_val; - - amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_REG, - .buffer_size = sizeof(reg), - .buffer = reinterpret_cast(®)}; - - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg); - - return true; -} - -} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h deleted file mode 100644 index ca4f295f8..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h +++ 
/dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. - -#ifndef PCIE_DEVICE_LINUX_XDNA_H -#define PCIE_DEVICE_LINUX_XDNA_H - -#include -#include - -#include "shared.h" -#include "shim_debug.h" - -namespace shim_xdna { - -#define XRT_NULL_HANDLE NULL - -// cuidx_type - encode cuidx and domain -// -// @domain_index: index within domain -// @domain: domain identifier -// @index: combined encoded index -// -// The domain_index is used in command cumask in exec_buf -// The combined index is used in context creation in open_context -struct cuidx_type { - union { - std::uint32_t index; - struct { - std::uint16_t domain_index; // [15-0] - std::uint16_t domain; // [31-16] - }; - }; - - // Ensure consistent use of domain and index types - using domain_type = uint16_t; - using domain_index_type = uint16_t; -}; - -struct hw_ctx; -struct pdev; -struct bo; - -struct device { - device(const pdev& pdev, void* shim_handle); - - ~device(); - - using qos_type = std::map; - enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; - std::unique_ptr alloc_bo(void* userptr, uint32_t ctx_id, size_t size, - uint64_t flags); - - // std::unique_ptr create_hw_context(const device& dev, - // const qos_type& qos) const; - - std::unique_ptr import_bo(shared_handle::export_handle ehdl) const; - - const pdev& get_pdev() const; - - std::unique_ptr alloc_bo(size_t size, uint64_t flags); - - std::unique_ptr import_bo(pid_t, shared_handle::export_handle); - - std::unique_ptr create_hw_context(const qos_type& qos, - access_mode mode) const; - - std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, - uint32_t size); - - size_t write_aie_mem(uint16_t col, uint16_t row, uint32_t offset, - const std::vector& buf); - - uint32_t read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr); - - bool write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr, - uint32_t reg_val); - - const pdev& 
m_pdev; // The pcidev that this device object is derived from - std::map m_bo_map; - void* m_handle = XRT_NULL_HANDLE; - - mutable std::mutex m_mutex; -}; - -} // namespace shim_xdna - -#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h deleted file mode 100644 index bce5d1623..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h +++ /dev/null @@ -1,1176 +0,0 @@ -/* - * Copyright (C) 2019-2022, Xilinx Inc - * - * This file is dual licensed. It may be redistributed and/or modified - * under the terms of the Apache 2.0 License OR version 2 of the GNU - * General Public License. - * - * Apache License Verbiage - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * GPL license Verbiage: - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. This program is - * distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public - * License for more details. 
You should have received a copy of the - * GNU General Public License along with this program; if not, write - * to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, - * Boston, MA 02111-1307 USA - * - */ - -/** - * DOC: XRT Embedded Runtime definition - * - * Header file *ert.h* defines data structures used by Emebdded Runtime (ERT) and - * XRT xclExecBuf() API. - */ - -#ifndef _ERT_H_ -#define _ERT_H_ - -#if defined(__linux__) && defined(__KERNEL__) -# include -#elif defined(__windows__) && defined(_KERNEL_MODE) -# include -#elif defined(__cplusplus) && !defined(_KERNEL_MODE) -# include -# include -#else -# include -# include -# include -#endif - -#ifdef _WIN32 -# pragma warning( push ) -# pragma warning( disable : 4200 4201 ) -#endif - -#if defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wpedantic" -#endif - -#define to_cfg_pkg(pkg) \ - ((struct ert_configure_cmd *)(pkg)) -#define to_start_krnl_pkg(pkg) \ - ((struct ert_start_kernel_cmd *)(pkg)) -#define to_copybo_pkg(pkg) \ - ((struct ert_start_copybo_cmd *)(pkg)) -#define to_cfg_sk_pkg(pkg) \ - ((struct ert_configure_sk_cmd *)(pkg)) -#define to_init_krnl_pkg(pkg) \ - ((struct ert_init_kernel_cmd *)(pkg)) -#define to_validate_pkg(pkg) \ - ((struct ert_validate_cmd *)(pkg)) -#define to_abort_pkg(pkg) \ - ((struct ert_abort_cmd *)(pkg)) - - -#define HOST_RW_PATTERN 0xF0F0F0F0 -#define DEVICE_RW_PATTERN 0x0F0F0F0F - -/** - * struct ert_packet: ERT generic packet format - * - * @state: [3-0] current state of a command - * @custom: [11-4] custom per specific commands - * @count: [22-12] number of words in payload (data) - * @opcode: [27-23] opcode identifying specific command - * @type: [31-28] type of command (currently 0) - * @data: count number of words representing packet payload - */ -struct ert_packet { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t custom:8; /* [11-4] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - 
uint32_t type:4; /* [31-28] */ - }; - uint32_t header; - }; - uint32_t data[1]; /* count number of words */ -}; - -/** - * struct ert_start_kernel_cmd: ERT start kernel command format - * - * @state: [3-0] current state of a command - * @stat_enabled: [4] enabled driver to record timestamp for various - * states cmd has gone through. The stat data - * is appended after cmd data. - * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask - * @count: [22-12] number of words following header for cmd data. Not - * include stat data. - * @opcode: [27-23] 0, opcode for start_kernel - * @type: [31-27] 0, type of start_kernel - * - * @cu_mask: first mandatory CU mask - * @data: count-1 number of words representing interpreted payload - * - * The packet payload is comprised of reserved id field, a mandatory CU mask, - * and extra_cu_masks per header field, followed by a CU register map of size - * (count - (1 + extra_cu_masks)) uint32_t words. - */ -struct ert_start_kernel_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t stat_enabled:1; /* [4] */ - uint32_t unused:5; /* [9-5] */ - uint32_t extra_cu_masks:2; /* [11-10] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - - /* payload */ - uint32_t cu_mask; /* mandatory cu mask */ - uint32_t data[1]; /* count-1 number of words */ -}; - -/** - * struct ert_dpu_data - interpretation of data payload for ERT_START_DPU - * - * @instruction_buffer: address of instruction buffer - * @instruction_buffer_size: size of instruction buffer in bytes - * @chained: number of following ert_dpu_data elements - * - * The ert_dpu_data is prepended to data payload of ert_start_kernel_cmd - * after any extra cu masks. The payload count of the ert packet is - * incremented with the size (words) of ert_dpu_data elements - * preprended to the data payload. 
- * - * The data payload for ERT_START_DPU is interpreted as fixed instruction - * buffer address along with instruction count, followed by regular kernel - * arguments. - */ -struct ert_dpu_data { - uint64_t instruction_buffer; /* buffer address 2 words */ - uint32_t instruction_buffer_size; /* size of buffer in bytes */ - uint32_t chained; /* number of following ert_dpu_data elements */ -}; - -/** - * struct ert_npu_data - interpretation of data payload for ERT_START_NPU - * - * @instruction_buffer: address of instruction buffer - * @instruction_buffer_size: size of instruction buffer in bytes - * @instruction_prop_count: WORD length of property name value pairs - * - * The ert_npu_data is prepended to data payload of ert_start_kernel_cmd - * after any extra cu masks. The payload count of the ert packet is - * incremented with the size (words) of ert_npu_data elements - * preprended to the data payload. - * - * The data payload for ERT_START_NPU is interpreted as instruction - * buffer address, instruction count along with instruction property, - * followed by regular kernel arguments. - * - * When instruction_prop_count is non-zero, it indicates the length - * (in 32 bits WORD) of the instruction buffer properties after this - * fields. This count is reserved for future extension. One example - * propertiy is the number of actual columns this instruction used. 
- */ -struct ert_npu_data { - uint64_t instruction_buffer; /* buffer address 2 words */ - uint32_t instruction_buffer_size; /* size of buffer in bytes */ - uint32_t instruction_prop_count; /* WORD length of following properties nv pairs */ -}; - -/** - * struct ert_npu_preempt_data - interpretation of data payload for ERT_START_NPU_PREEMPT - * - * @instruction_buffer: address of instruction buffer - * @save_buffer: address of save instruction buffer - * @restore_buffer: address of restrore instruction buffer - * @instruction_buffer_size: size of instruction buffer in bytes - * @save_buffer_size: size of save instruction buffer in bytes - * @restore_buffer_size: size of restore instruction buffer in bytes - * @instruction_prop_count: number of property name value pairs - * - * The ert_npu_preempt_data is prepended to data payload of ert_start_kernel_cmd - * after any extra cu masks. The payload count of the ert packet is - * incremented with the size (words) of ert_npu_preempt_data elements - * preprended to the data payload. - * - * The data payload for ERT_START_NPU_PREEMPT is interpreted as instruction - * buffer, save instruction buffer, restore instruction buffer and their - * size, along with instruction property, followed by regular kernel arguments. - * - * When instruction_prop_count is non-zero, it indicates the length - * (in 32 bits WORD) of the instruction buffer properties after this - * fields. This count is reserved for future extension. One example - * propertiy is the number of actual columns this instruction used. 
- */ -struct ert_npu_preempt_data { - uint64_t instruction_buffer; /* buffer address 2 words */ - uint64_t save_buffer; /* buffer address 2 words */ - uint64_t restore_buffer; /* buffer address 2 words */ - uint32_t instruction_buffer_size; /* size of buffer in bytes */ - uint32_t save_buffer_size; /* size of buffer in bytes */ - uint32_t restore_buffer_size; /* size of buffer in bytes */ - uint32_t instruction_prop_count; /* DWORD length of following properties nv pairs */ -}; - -/** - * struct ert_cmd_chain_data - interpretation of data payload for ERT_CMD_CHAIN - * - * @command_count: number of commands in chain - * @submit_index: index of last successfully submitted command in chain - * @error_index: index of failing command if cmd status is not completed - * @data[]: address of each command in chain - * - * This is the payload of an *ert_packet* when the opcode is ERT_CMD_CHAIN - */ -struct ert_cmd_chain_data { - uint32_t command_count; - uint32_t submit_index; - uint32_t error_index; - uint32_t reserved[3]; - uint64_t data[]; -}; - -#ifndef U30_DEBUG -#define ert_write_return_code(cmd, value) \ -do { \ - struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ - int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks; \ - skcmd->data[end_idx] = value; \ -} while (0) - -#define ert_read_return_code(cmd, ret) \ -do { \ - struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ - int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks; \ - ret = skcmd->data[end_idx]; \ -} while (0) -#else -/* These are for debug legacy U30 firmware */ -#define ert_write_return_code(cmd, value) \ -do { \ - struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ - skcmd->cu_mask = value; \ -} while (0) - -#define ert_read_return_code(cmd, ret) \ -do { \ - struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \ - ret = skcmd->cu_mask; \ -} while (0) -#endif - -/** - * struct ert_init_kernel_cmd: ERT initialize 
kernel command format - * this command initializes CUs by writing CU registers. CUs are - * represented by cu_mask and extra_cu_masks. - * - * @state: [3-0] current state of a command - * @update_rtp: [4] command is for runtime update of cu argument - * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask - * @count: [22-12] number of words following header - * @opcode: [27-23] 0, opcode for init_kernel - * @type: [31-27] 0, type of init_kernel - * - * @cu_run_timeout the configured CU timeout value in Microseconds - * setting to 0 means CU should not timeout - * @cu_reset_timeout the configured CU reset timeout value in Microseconds - * when CU timeout, CU will be reset. this indicates - * CU reset should be completed within the timeout value. - * if cu_run_timeout is set to 0, this field is undefined. - * - * @cu_mask: first mandatory CU mask - * @data: count-9 number of words representing interpreted payload - * - * The packet payload is comprised of reserved id field, 8 reserved fields, - * a mandatory CU mask, and extra_cu_masks per header field, followed by a - * CU register map of size (count - (9 + extra_cu_masks)) uint32_t words. 
- */ -struct ert_init_kernel_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t update_rtp:1; /* [4] */ - uint32_t unused:5; /* [9-5] */ - uint32_t extra_cu_masks:2; /* [11-10] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - - uint32_t cu_run_timeout; /* CU timeout value in Microseconds */ - uint32_t cu_reset_timeout; /* CU reset timeout value in Microseconds */ - uint32_t reserved[6]; /* reserved for future use */ - - /* payload */ - uint32_t cu_mask; /* mandatory cu mask */ - uint32_t data[1]; /* count-9 number of words */ -}; - -#define KDMA_BLOCK_SIZE 64 /* Limited by KDMA CU */ -struct ert_start_copybo_cmd { - uint32_t state:4; /* [3-0], must be ERT_CMD_STATE_NEW */ - uint32_t unused:6; /* [9-4] */ - uint32_t extra_cu_masks:2; /* [11-10], = 3 */ - uint32_t count:11; /* [22-12], = 16, exclude 'arg' */ - uint32_t opcode:5; /* [27-23], = ERT_START_COPYBO */ - uint32_t type:4; /* [31-27], = ERT_DEFAULT */ - uint32_t cu_mask[4]; /* mandatory cu masks */ - uint32_t reserved[4]; /* for scheduler use */ - uint32_t src_addr_lo; /* low 32 bit of src addr */ - uint32_t src_addr_hi; /* high 32 bit of src addr */ - uint32_t src_bo_hdl; /* src bo handle, cleared by driver */ - uint32_t dst_addr_lo; /* low 32 bit of dst addr */ - uint32_t dst_addr_hi; /* high 32 bit of dst addr */ - uint32_t dst_bo_hdl; /* dst bo handle, cleared by driver */ - uint32_t size; /* size in bytes low 32 bit*/ - uint32_t size_hi; /* size in bytes high 32 bit*/ - void *arg; /* pointer to aux data for KDS */ -}; - -/** - * struct ert_configure_cmd: ERT configure command format - * - * @state: [3-0] current state of a command - * @count: [22-12] number of words in payload (5 + num_cus) - * @opcode: [27-23] 1, opcode for configure - * @type: [31-27] 0, type of configure - * - * @slot_size: command queue slot size - * @num_cus: number of compute units in program - * @cu_shift: shift value to 
convert CU idx to CU addr - * @cu_base_addr: base address to add to CU addr for actual physical address - * - * @ert:1 enable embedded HW scheduler - * @polling:1 poll for command completion - * @cu_dma:1 enable CUDMA custom module for HW scheduler - * @cu_isr:1 enable CUISR custom module for HW scheduler - * @cq_int:1 enable interrupt from host to HW scheduler - * @cdma:1 enable CDMA kernel - * @unused:25 - * @dsa52:1 reserved for internal use - * - * @data: addresses of @num_cus CUs - */ -struct ert_configure_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t unused:8; /* [11-4] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - - /* payload */ - uint32_t slot_size; - uint32_t num_cus; - uint32_t cu_shift; - uint32_t cu_base_addr; - - /* features */ - uint32_t ert:1; - uint32_t polling:1; - uint32_t cu_dma:1; - uint32_t cu_isr:1; - uint32_t cq_int:1; - uint32_t cdma:1; - uint32_t dataflow:1; - /* WORKAROUND: allow xclRegWrite/xclRegRead access shared CU */ - uint32_t rw_shared:1; - uint32_t kds_30:1; - uint32_t dmsg:1; - uint32_t echo:1; - uint32_t intr:1; - uint32_t unusedf:19; - uint32_t dsa52:1; - - /* cu address map size is num_cus */ - uint32_t data[1]; -}; - -/* - * Note: We need to put maximum 128 soft kernel image - * in one config command (1024 DWs including header). - * So each one needs to be smaller than 8 DWs. - * - * This data struct is obsoleted. Only used in legacy ERT firmware. - * Use 'struct config_sk_image_uuid' instead on XGQ based ERT. - * - * @start_cuidx: start index of compute units of each image - * @num_cus: number of compute units of each image - * @sk_name: symbol name of soft kernel of each image - */ -struct config_sk_image { - uint32_t start_cuidx; - uint32_t num_cus; - uint32_t sk_name[5]; -}; - -/* - * Note: We need to put maximum 128 soft kernel image - * in one config command (1024 DWs including header). 
- * So each one needs to be smaller than 8 DWs. - * - * @start_cuidx: start index of compute units of each image - * @num_cus: number of compute units of each image - * @sk_name: symbol name of soft kernel of each image - * @sk_uuid: xclbin uuid that this soft kernel image belones to - */ -struct config_sk_image_uuid { - uint32_t start_cuidx; - uint32_t num_cus; - uint32_t sk_name[5]; - unsigned char sk_uuid[16]; - uint32_t uint32_t; -}; - -/** - * struct ert_configure_sk_cmd: ERT configure soft kernel command format - * - * @state: [3-0] current state of a command - * @count: [22-12] number of words in payload - * @opcode: [27-23] 1, opcode for configure - * @type: [31-27] 0, type of configure - * - * @num_image: number of images -*/ -struct ert_configure_sk_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t unused:8; /* [11-4] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - - /* payload */ - uint32_t num_image; - struct config_sk_image image[1]; -}; - -/** - * struct ert_unconfigure_sk_cmd: ERT unconfigure soft kernel command format - * - * @state: [3-0] current state of a command - * @count: [22-12] number of words in payload - * @opcode: [27-23] 1, opcode for configure - * @type: [31-27] 0, type of configure - * - * @start_cuidx: start index of compute units - * @num_cus: number of compute units in program - */ -struct ert_unconfigure_sk_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t unused:8; /* [11-4] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - - /* payload */ - uint32_t start_cuidx; - uint32_t num_cus; -}; - -/** - * struct ert_abort_cmd: ERT abort command format. 
- * - * @exec_bo_handle: The bo handle of execbuf command to abort - */ -struct ert_abort_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t custom:8; /* [11-4] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - - /* payload */ - uint64_t exec_bo_handle; -}; - -/** - * struct ert_validate_cmd: ERT BIST command format. - * - */ -struct ert_validate_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t custom:8; /* [11-4] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - uint32_t timestamp; - uint32_t cq_read_single; - uint32_t cq_write_single; - uint32_t cu_read_single; - uint32_t cu_write_single; -}; - -/** - * struct ert_validate_cmd: ERT BIST command format. - * - */ -struct ert_access_valid_cmd { - union { - struct { - uint32_t state:4; /* [3-0] */ - uint32_t custom:8; /* [11-4] */ - uint32_t count:11; /* [22-12] */ - uint32_t opcode:5; /* [27-23] */ - uint32_t type:4; /* [31-27] */ - }; - uint32_t header; - }; - uint32_t h2h_access; - uint32_t h2d_access; - uint32_t d2h_access; - uint32_t d2d_access; - uint32_t d2cu_access; - uint32_t wr_count; - uint32_t wr_test; -}; - -/** - * ERT command state - * - * @ERT_CMD_STATE_NEW: Set by host before submitting a command to - * scheduler - * @ERT_CMD_STATE_QUEUED: Internal scheduler state - * @ERT_CMD_STATE_SUBMITTED: Internal scheduler state - * @ERT_CMD_STATE_RUNNING: Internal scheduler state - * @ERT_CMD_STATE_COMPLETED: Set by scheduler when command completes - * @ERT_CMD_STATE_ERROR: Set by scheduler if command failed - * @ERT_CMD_STATE_ABORT: Set by scheduler if command abort - * @ERT_CMD_STATE_TIMEOUT: Set by scheduler if command timeout and reset - * @ERT_CMD_STATE_NORESPONSE: Set by scheduler if command timeout and fail to - * reset - */ -enum ert_cmd_state { - ERT_CMD_STATE_NEW = 1, - ERT_CMD_STATE_QUEUED 
= 2, - ERT_CMD_STATE_RUNNING = 3, - ERT_CMD_STATE_COMPLETED = 4, - ERT_CMD_STATE_ERROR = 5, - ERT_CMD_STATE_ABORT = 6, - ERT_CMD_STATE_SUBMITTED = 7, - ERT_CMD_STATE_TIMEOUT = 8, - ERT_CMD_STATE_NORESPONSE = 9, - ERT_CMD_STATE_SKERROR = 10, //Check for error return code from Soft Kernel - ERT_CMD_STATE_SKCRASHED = 11, //Soft kernel has crashed - ERT_CMD_STATE_MAX, // Always the last one -}; - -struct cu_cmd_state_timestamps { - uint64_t skc_timestamps[ERT_CMD_STATE_MAX]; // In nano-second -}; - -/** - * Opcode types for commands - * - * @ERT_START_CU: start a workgroup on a CU - * @ERT_START_KERNEL: currently aliased to ERT_START_CU - * @ERT_CONFIGURE: configure command scheduler - * @ERT_EXEC_WRITE: execute a specified CU after writing - * @ERT_CU_STAT: get stats about CU execution - * @ERT_START_COPYBO: start KDMA CU or P2P, may be converted to ERT_START_CU - * before cmd reach to scheduler, short-term hack - * @ERT_SK_CONFIG: configure soft kernel - * @ERT_SK_START: start a soft kernel - * @ERT_SK_UNCONFIG: unconfigure a soft kernel - * @ERT_START_KEY_VAL: same as ERT_START_CU but with key-value pair flavor - * @ERT_START_DPU: instruction buffer command format - * @ERT_CMD_CHAIN: command chain - * @ERT_START_NPU: instruction buffer command format on NPU format - * @ERT_START_NPU_PREEMPT: instruction buffer command with preemption format on NPU - */ -enum ert_cmd_opcode { - ERT_START_CU = 0, - ERT_START_KERNEL = 0, - ERT_CONFIGURE = 2, - ERT_EXIT = 3, - ERT_ABORT = 4, - ERT_EXEC_WRITE = 5, - ERT_CU_STAT = 6, - ERT_START_COPYBO = 7, - ERT_SK_CONFIG = 8, - ERT_SK_START = 9, - ERT_SK_UNCONFIG = 10, - ERT_INIT_CU = 11, - ERT_START_FA = 12, - ERT_CLK_CALIB = 13, - ERT_MB_VALIDATE = 14, - ERT_START_KEY_VAL = 15, - ERT_ACCESS_TEST_C = 16, - ERT_ACCESS_TEST = 17, - ERT_START_DPU = 18, - ERT_CMD_CHAIN = 19, - ERT_START_NPU = 20, - ERT_START_NPU_PREEMPT = 21, -}; - -/** - * Command types - * - * @ERT_DEFAULT: default command type - * @ERT_KDS_LOCAL: command processed by 
KDS locally - * @ERT_CTRL: control command uses reserved command queue slot - * @ERT_CU: compute unit command - */ -enum ert_cmd_type { - ERT_DEFAULT = 0, - ERT_KDS_LOCAL = 1, - ERT_CTRL = 2, - ERT_CU = 3, - ERT_SCU = 4, -}; - -/** - * Soft kernel types - * - * @SOFTKERNEL_TYPE_EXEC: executable - */ -enum softkernel_type { - SOFTKERNEL_TYPE_EXEC = 0, -}; - -/* - * Base address GPIO per spec - * | Offset | Description - * ----------------------- - * | 0x00 | ERT_MGMT_PF_base_addr (Not sure where this should be use) - * | 0x08 | ERT_USER_PF_base_addr. The base address of ERT peripherals - */ -#if defined(ERT_BUILD_V20) -uint32_t ert_base_addr = 0; -# define ERT_BASE_ADDR 0x01F30008 -#endif - -#if defined(ERT_BUILD_V30) -uint32_t ert_base_addr = 0; -# define ERT_BASE_ADDR 0x01F30008 -#endif - -/** - * Address constants per spec - */ -#define ERT_WORD_SIZE 4 /* 4 bytes */ -#define ERT_CQ_SIZE 0x10000 /* 64K */ -#if defined(ERT_BUILD_U50) -# define ERT_CQ_BASE_ADDR 0x340000 -# define ERT_CSR_ADDR 0x360000 -#elif defined(ERT_BUILD_V20) -# define ERT_CQ_BASE_ADDR (0x000000 + ert_base_addr) -# define ERT_CSR_ADDR (0x010000 + ert_base_addr) -#elif defined(ERT_BUILD_V30) -# define ERT_CQ_BASE_ADDR 0x1F60000 -# define ERT_CSR_ADDR (0x010000 + ert_base_addr) -#else -# define ERT_CQ_BASE_ADDR 0x190000 -# define ERT_CSR_ADDR 0x180000 -#endif - -/** - * The STATUS REGISTER is for communicating completed CQ slot indices - * MicroBlaze write, host reads. MB(W) / HOST(COR) - */ -#define ERT_STATUS_REGISTER_ADDR (ERT_CSR_ADDR) -#define ERT_STATUS_REGISTER_ADDR0 (ERT_CSR_ADDR) -#define ERT_STATUS_REGISTER_ADDR1 (ERT_CSR_ADDR + 0x4) -#define ERT_STATUS_REGISTER_ADDR2 (ERT_CSR_ADDR + 0x8) -#define ERT_STATUS_REGISTER_ADDR3 (ERT_CSR_ADDR + 0xC) - -/** - * The CU DMA REGISTER is for communicating which CQ slot is to be started - * on a specific CU. 
MB selects a free CU on which the command can - * run, then writes the 1<state = ERT_CMD_STATE_NEW; - pkt->extra_cu_masks = 3; - pkt->count = 16; - pkt->opcode = ERT_START_COPYBO; - pkt->type = ERT_DEFAULT; - pkt->cu_mask[0] = 0; - pkt->cu_mask[1] = 0; - pkt->cu_mask[2] = 0; - pkt->cu_mask[3] = 0; - pkt->src_addr_lo = (uint32_t)src_offset; - pkt->src_addr_hi = (src_offset >> 32) & 0xFFFFFFFF; - pkt->src_bo_hdl = src_bo; - pkt->dst_addr_lo = (uint32_t)dst_offset; - pkt->dst_addr_hi = (dst_offset >> 32) & 0xFFFFFFFF; - pkt->dst_bo_hdl = dst_bo; - pkt->size = size; - pkt->size_hi = 0; /* set to 0 explicitly */ - pkt->arg = 0; -} -static inline uint64_t -ert_copybo_src_offset(struct ert_start_copybo_cmd *pkt) -{ - return (uint64_t)pkt->src_addr_hi << 32 | pkt->src_addr_lo; -} -static inline uint64_t -ert_copybo_dst_offset(struct ert_start_copybo_cmd *pkt) -{ - return (uint64_t)pkt->dst_addr_hi << 32 | pkt->dst_addr_lo; -} -static inline uint64_t -ert_copybo_size(struct ert_start_copybo_cmd *pkt) -{ - return pkt->size; -} - -static inline bool -ert_valid_opcode(struct ert_packet *pkt) -{ - struct ert_start_kernel_cmd *skcmd; - struct ert_init_kernel_cmd *ikcmd; - struct ert_start_copybo_cmd *sccmd; - struct ert_configure_cmd *ccmd; - struct ert_configure_sk_cmd *cscmd; - struct ert_cmd_chain_data *ccdata; - bool valid; - - switch (pkt->opcode) { - case ERT_START_CU: - skcmd = to_start_krnl_pkg(pkt); - /* 1 cu mask + 4 registers */ - valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 4); - break; - case ERT_START_DPU: - skcmd = to_start_krnl_pkg(pkt); - /* 1 mandatory cumask + extra_cu_masks + size (in words) of ert_dpu_data */ - valid = (skcmd->count >= 1+ skcmd->extra_cu_masks + sizeof(struct ert_dpu_data) / sizeof(uint32_t)); - break; - case ERT_CMD_CHAIN: - ccdata = (struct ert_cmd_chain_data*) pkt->data; - /* header count must match number of commands in payload */ - valid = (pkt->count == (ccdata->command_count * sizeof(uint64_t) + sizeof(struct 
ert_cmd_chain_data)) / sizeof(uint32_t)); - break; - case ERT_START_NPU: - skcmd = to_start_krnl_pkg(pkt); - /* 1 mandatory cumask + extra_cu_masks + ert_npu_data */ - valid = (skcmd->count >= 1+ skcmd->extra_cu_masks + sizeof(struct ert_npu_data) / sizeof(uint32_t)); - break; - case ERT_START_NPU_PREEMPT: - skcmd = to_start_krnl_pkg(pkt); - /* 1 mandatory cumask + extra_cu_masks + ert_npu_preempt_data */ - valid = (skcmd->count >= 1+ skcmd->extra_cu_masks + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t)); - break; - case ERT_START_KEY_VAL: - skcmd = to_start_krnl_pkg(pkt); - /* 1 cu mask */ - valid = (skcmd->count >= skcmd->extra_cu_masks + 1); - break; - case ERT_EXEC_WRITE: - skcmd = to_start_krnl_pkg(pkt); - /* 1 cu mask + 6 registers */ - valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 6); - break; - case ERT_START_FA: - skcmd = to_start_krnl_pkg(pkt); - /* 1 cu mask */ - valid = (skcmd->count >= skcmd->extra_cu_masks + 1); - break; - case ERT_SK_START: - skcmd = to_start_krnl_pkg(pkt); - /* 1 cu mask + 1 control word */ - valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 1); - break; - case ERT_CONFIGURE: - ccmd = to_cfg_pkg(pkt); - /* 5 mandatory fields in struct */ - valid = (ccmd->count >= 5 + ccmd->num_cus); - break; - case ERT_START_COPYBO: - sccmd = to_copybo_pkg(pkt); - valid = (sccmd->count == 16); - break; - case ERT_INIT_CU: - ikcmd = to_init_krnl_pkg(pkt); - /* 9 mandatory words in struct + 4 control registers */ - valid = (ikcmd->count >= ikcmd->extra_cu_masks + 9 + 4); - break; - case ERT_SK_CONFIG: - cscmd = to_cfg_sk_pkg(pkt); - valid = (cscmd->count == sizeof(struct config_sk_image) * cscmd->num_image / 4 + 1); - break; - case ERT_CLK_CALIB: - case ERT_MB_VALIDATE: - case ERT_ACCESS_TEST_C: - case ERT_CU_STAT: /* TODO: Rules to validate? 
*/ - case ERT_EXIT: - case ERT_ABORT: - valid = true; - break; - case ERT_SK_UNCONFIG: /* NOTE: obsolete */ - default: - valid = false; - } - - return valid; -} - -static inline uint64_t -get_ert_packet_size_bytes(struct ert_packet *pkt) -{ - // header plus payload - return sizeof(pkt->header) + pkt->count * sizeof(uint32_t); -} - -static inline struct ert_dpu_data* -get_ert_dpu_data(struct ert_start_kernel_cmd* pkt) -{ - if (pkt->opcode != ERT_START_DPU) - return NULL; - - // past extra cu_masks embedded in the packet data - return (struct ert_dpu_data*) (pkt->data + pkt->extra_cu_masks); -} - -static inline struct ert_dpu_data* -get_ert_dpu_data_next(struct ert_dpu_data* dpu_data) -{ - if (dpu_data->chained == 0) - return NULL; - - return dpu_data + 1; -} - -static inline struct ert_cmd_chain_data* -get_ert_cmd_chain_data(struct ert_packet* pkt) -{ - if (pkt->opcode != ERT_CMD_CHAIN) - return NULL; - - return (struct ert_cmd_chain_data*) pkt->data; -} - -static inline struct ert_npu_data* -get_ert_npu_data(struct ert_start_kernel_cmd* pkt) -{ - if (pkt->opcode != ERT_START_NPU) - return NULL; - - // past extra cu_masks embedded in the packet data - return (struct ert_npu_data*) (pkt->data + pkt->extra_cu_masks); -} - -static inline struct ert_npu_preempt_data* -get_ert_npu_preempt_data(struct ert_start_kernel_cmd* pkt) -{ - if (pkt->opcode != ERT_START_NPU_PREEMPT) - return NULL; - - // past extra cu_masks embedded in the packet data - return (struct ert_npu_preempt_data*) (pkt->data + pkt->extra_cu_masks); -} - -static inline uint32_t* -get_ert_regmap_begin(struct ert_start_kernel_cmd* pkt) -{ - switch (pkt->opcode) { - case ERT_START_DPU: - return pkt->data + pkt->extra_cu_masks - + (get_ert_dpu_data(pkt)->chained + 1) * sizeof(struct ert_dpu_data) / sizeof(uint32_t); - - case ERT_START_NPU: - return pkt->data + pkt->extra_cu_masks - + sizeof(struct ert_npu_data) / sizeof(uint32_t) - + get_ert_npu_data(pkt)->instruction_prop_count; - - case 
ERT_START_NPU_PREEMPT: - return pkt->data + pkt->extra_cu_masks - + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t) - + get_ert_npu_preempt_data(pkt)->instruction_prop_count; - - default: - // skip past embedded extra cu_masks - return pkt->data + pkt->extra_cu_masks; - } -} - -static inline uint32_t* -get_ert_regmap_end(struct ert_start_kernel_cmd* pkt) -{ - // pkt->count includes the mandatory cumask which precededs data array - return &pkt->cu_mask + pkt->count; -} - -static inline uint64_t -get_ert_regmap_size_bytes(struct ert_start_kernel_cmd* pkt) -{ - return (get_ert_regmap_end(pkt) - get_ert_regmap_begin(pkt)) * sizeof(uint32_t); -} - -#ifdef __linux__ -#define P2ROUNDUP(x, align) (-(-(x) & -(align))) -static inline struct cu_cmd_state_timestamps * -ert_start_kernel_timestamps(struct ert_start_kernel_cmd *pkt) -{ - uint64_t offset = pkt->count * sizeof(uint32_t) + sizeof(pkt->header); - /* Make sure the offset of timestamps are properly aligned. */ - return (struct cu_cmd_state_timestamps *) - ((char *)pkt + P2ROUNDUP(offset, sizeof(uint64_t))); -} - -/* Return 0 if this pkt doesn't support timestamp or disabled */ -static inline int -get_size_with_timestamps_or_zero(struct ert_packet *pkt) -{ - struct ert_start_kernel_cmd *skcmd; - int size = 0; - - switch (pkt->opcode) { - case ERT_START_CU: - case ERT_EXEC_WRITE: - case ERT_START_FA: - case ERT_SK_START: - skcmd = to_start_krnl_pkg(pkt); - if (skcmd->stat_enabled) { - size = (char *)ert_start_kernel_timestamps(skcmd) - (char *)pkt; - size += sizeof(struct cu_cmd_state_timestamps); - } - } - - return size; -} -#endif - -#if defined(__GNUC__) -# pragma GCC diagnostic pop -#endif - -#ifdef _WIN32 -# pragma warning( pop ) -#endif - -#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp deleted file mode 100644 index 850e4198a..000000000 --- 
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "fence.h" - -#include - -#include "amdxdna_accel.h" -#include "fence.h" -#include "hwctx.h" -#include "pcidev.h" - -namespace { - -uint32_t create_syncobj(const shim_xdna::pdev& dev) { - drm_syncobj_create csobj = {.handle = AMDXDNA_INVALID_FENCE_HANDLE, - .flags = 0}; - dev.ioctl(DRM_IOCTL_SYNCOBJ_CREATE, &csobj); - return csobj.handle; -} - -void destroy_syncobj(const shim_xdna::pdev& dev, uint32_t hdl) { - drm_syncobj_destroy dsobj = {.handle = hdl}; - dev.ioctl(DRM_IOCTL_SYNCOBJ_DESTROY, &dsobj); -} - -uint64_t query_syncobj_timeline(const shim_xdna::pdev& dev, uint32_t sobj_hdl) { - uint64_t point = 0; - drm_syncobj_timeline_array sobjs = { - .handles = reinterpret_cast(&sobj_hdl), - .points = reinterpret_cast(&point), - .count_handles = 1, - .flags = 0}; - dev.ioctl(DRM_IOCTL_SYNCOBJ_QUERY, &sobjs); - return point; -} - -int export_syncobj(const shim_xdna::pdev& dev, uint32_t sobj_hdl) { - drm_syncobj_handle esobj = { - .handle = sobj_hdl, - .flags = 0, - .fd = -1, - }; - dev.ioctl(DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &esobj); - return esobj.fd; -} - -uint32_t import_syncobj(const shim_xdna::pdev& dev, int fd) { - drm_syncobj_handle isobj = { - .handle = AMDXDNA_INVALID_FENCE_HANDLE, - .flags = 0, - .fd = fd, - }; - dev.ioctl(DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &isobj); - return isobj.handle; -} - -void signal_syncobj(const shim_xdna::pdev& dev, uint32_t sobj_hdl, - uint64_t timepoint) { - drm_syncobj_timeline_array sobjs = { - .handles = reinterpret_cast(&sobj_hdl), - .points = reinterpret_cast(&timepoint), - .count_handles = 1, - .flags = 0}; - dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, &sobjs); -} - -void wait_syncobj_done(const shim_xdna::pdev& dev, uint32_t sobj_hdl, - uint64_t timepoint) { - drm_syncobj_timeline_wait wsobj = { - .handles 
= reinterpret_cast(&sobj_hdl), - .points = reinterpret_cast(&timepoint), - .timeout_nsec = std::numeric_limits::max(), /* wait forever */ - .count_handles = 1, - .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, - }; - dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); -} - -void wait_syncobj_available(const shim_xdna::pdev& dev, - const uint32_t* sobj_hdls, - const uint64_t* timepoints, uint32_t num) { - drm_syncobj_timeline_wait wsobj = { - .handles = reinterpret_cast(sobj_hdls), - .points = reinterpret_cast(timepoints), - .timeout_nsec = std::numeric_limits::max(), /* wait forever */ - .count_handles = num, - .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL | - DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | - DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE, - }; - dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); -} - -void submit_wait_syncobjs(const shim_xdna::pdev& dev, - const shim_xdna::hw_ctx* ctx, - const uint32_t* sobj_hdls, const uint64_t* points, - uint32_t num) { - wait_syncobj_available(dev, sobj_hdls, points, num); - - amdxdna_drm_exec_cmd ecmd = { - .hwctx = ctx->get_slotidx(), - .type = AMDXDNA_CMD_SUBMIT_DEPENDENCY, - .cmd_handles = reinterpret_cast(sobj_hdls), - .args = reinterpret_cast(points), - .cmd_count = num, - .arg_count = num, - }; - dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); -} - -void submit_signal_syncobj(const shim_xdna::pdev& dev, - const shim_xdna::hw_ctx* ctx, uint32_t sobj_hdl, - uint64_t point) { - amdxdna_drm_exec_cmd ecmd = { - .hwctx = ctx->get_slotidx(), - .type = AMDXDNA_CMD_SUBMIT_SIGNAL, - .cmd_handles = sobj_hdl, - .args = point, - .cmd_count = 1, - .arg_count = 1, - }; - dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); -} - -} // namespace - -namespace shim_xdna { - -fence::fence(const device& device) - : m_pdev(device.get_pdev()), - m_import(std::make_unique(-1)), - m_syncobj_hdl(create_syncobj(m_pdev)) { - shim_debug("Fence allocated: %d@%d", m_syncobj_hdl, m_state); -} - -fence::fence(const device& device, shared_handle::export_handle 
ehdl) - : m_pdev(device.get_pdev()), - m_import(std::make_unique(ehdl)), - m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())) { - shim_debug("Fence imported: %d@%ld", m_syncobj_hdl, m_state); -} - -fence::~fence() { - shim_debug("Fence going away: %d@%ld", m_syncobj_hdl, m_state); - destroy_syncobj(m_pdev, m_syncobj_hdl); -} - -std::unique_ptr fence::share() const { - if (m_state != initial_state) - shim_err(-EINVAL, "Can't share fence not at initial state."); - - return std::make_unique(export_syncobj(m_pdev, m_syncobj_hdl)); -} - -uint64_t fence::get_next_state() const { return m_state + 1; } - -uint64_t fence::wait_next_state() const { - std::lock_guard guard(m_lock); - - if (m_state != initial_state && m_signaled) - shim_err(-EINVAL, "Can't wait on fence that has been signaled before."); - return ++m_state; -} - -// Timeout value is ignored for now. -void fence::wait(uint32_t timeout_ms) const { - auto st = signal_next_state(); - shim_debug("Waiting for command fence %d@%ld", m_syncobj_hdl, st); - wait_syncobj_done(m_pdev, m_syncobj_hdl, st); -} - -void fence::submit_wait(const hw_ctx* ctx) const { - auto st = signal_next_state(); - shim_debug("Submitting wait for command fence %d@%ld", m_syncobj_hdl, st); - submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1); -} - -uint64_t fence::signal_next_state() const { - std::lock_guard guard(m_lock); - - if (m_state != initial_state && !m_signaled) - shim_err(-EINVAL, "Can't signal fence that has been waited before."); - if (m_state == initial_state) m_signaled = true; - return ++m_state; -} - -void fence::signal() const { - auto st = signal_next_state(); - shim_debug("Signaling command fence %d@%ld", m_syncobj_hdl, st); - signal_syncobj(m_pdev, m_syncobj_hdl, st); -} - -void fence::submit_signal(const hw_ctx* ctx) const { - auto st = signal_next_state(); - shim_debug("Submitting signal command fence %d@%ld", m_syncobj_hdl, st); - submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st); -} - -void 
fence::submit_wait(const pdev& dev, const hw_ctx* ctx, - const std::vector& fences) { - constexpr int max_fences = 1024; - uint32_t hdls[max_fences]; - uint64_t pts[max_fences]; - int i = 0; - - if (fences.size() > max_fences) - shim_err(-EINVAL, "Too many fences in one submit: %d", fences.size()); - - for (auto f : fences) { - auto fh = static_cast(f); - auto st = fh->wait_next_state(); - shim_debug("Waiting for command fence %d@%ld", fh->m_syncobj_hdl, st); - hdls[i] = fh->m_syncobj_hdl; - pts[i] = st; - i++; - } - submit_wait_syncobjs(dev, ctx, hdls, pts, i); -} - -} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h deleted file mode 100644 index a5acd4c1d..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#ifndef _FENCE_XDNA_H_ -#define _FENCE_XDNA_H_ - -#include - -#include "shared.h" -#include "shim_debug.h" - -namespace shim_xdna { - -struct device; -struct hw_ctx; -struct pdev; - -struct fence { - using export_handle = shared_handle::export_handle; - enum class access_mode : uint8_t { local, shared, process, hybrid }; - - fence(const device& device); - - fence(const device& device, shared_handle::export_handle ehdl); - - ~fence(); - - std::unique_ptr share() const; - - void wait(uint32_t timeout_ms) const; - - uint64_t get_next_state() const; - - void signal() const; - - void submit_wait(const hw_ctx*) const; - - static void submit_wait(const pdev& dev, const hw_ctx*, - const std::vector& fences); - - void submit_signal(const hw_ctx*) const; - - uint64_t wait_next_state() const; - - uint64_t signal_next_state() const; - - const pdev& m_pdev; - const std::unique_ptr m_import; - uint32_t m_syncobj_hdl; - - // Protecting below mutables - mutable std::mutex m_lock; - // Set once at first signal - mutable bool m_signaled = false; - // Ever incrementing at each wait/signal - static constexpr uint64_t initial_state = 0; - mutable uint64_t m_state = initial_state; -}; - -} // namespace shim_xdna - -#endif // _FENCE_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp deleted file mode 100644 index 23dd3b728..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "hwctx.h" - -#include "bo.h" -#include "device.h" -#include "hwq.h" -#include "pcidev.h" - -namespace shim_xdna { - -hw_ctx::hw_ctx(const device& dev, const qos_type& qos, std::unique_ptr q) - : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) { - shim_debug("Creating HW context..."); - init_qos_info(qos); -} - -hw_ctx::~hw_ctx() { - delete_ctx_on_device(); - shim_debug("Destroyed HW context (%d)...", m_handle); -} - -uint32_t hw_ctx::get_slotidx() const { return m_handle; } - -void hw_ctx::set_slotidx(uint32_t id) { m_handle = id; } - -cuidx_type hw_ctx::open_cu_context(const std::string& cu_name) { - for (uint32_t i = 0; i < m_cu_info.size(); i++) { - auto& ci = m_cu_info[i]; - if (ci.m_name == cu_name) return cuidx_type{.index = i}; - } - - shim_err(ENOENT, "CU name (%s) not found", cu_name.c_str()); -} - -void hw_ctx::close_cu_context(cuidx_type cuidx) { - // Nothing to be done -} - -std::unique_ptr hw_ctx::alloc_bo(size_t size, uint64_t flags) { - return alloc_bo(nullptr, size, flags); -} - -std::unique_ptr hw_ctx::import_bo(pid_t pid, - shared_handle::export_handle ehdl) { - // const_cast: import_bo() is not const yet in device class - auto& dev = const_cast(get_device()); - return dev.import_bo(pid, ehdl); -} - -hw_q* hw_ctx::get_hw_queue() { return m_q.get(); } - -void hw_ctx::init_qos_info(const qos_type& qos) { - for (auto& [key, value] : qos) { - if (key == "gops") - m_qos.gops = value; - else if (key == "fps") - m_qos.fps = value; - else if (key == "dma_bandwidth") - m_qos.dma_bandwidth = value; - else if (key == "latency") - m_qos.latency = value; - else if (key == "frame_execution_time") - m_qos.frame_exec_time = value; - else if (key == "priority") - m_qos.priority = value; - } -} - -const device& hw_ctx::get_device() { return m_device; } - -const std::vector& hw_ctx::get_cu_info() const { - return m_cu_info; -} - -void hw_ctx::create_ctx_on_device() { - amdxdna_drm_create_hwctx arg = {}; - arg.qos_p = 
reinterpret_cast(&m_qos); - arg.umq_bo = m_q->get_queue_bo(); - arg.max_opc = m_ops_per_cycle; - // arg.num_tiles = - // m_num_cols * - // xrt_core::device_query(&m_device) - // .core_rows; - arg.log_buf_bo = m_log_bo - ? static_cast(m_log_bo.get())->get_drm_bo_handle() - : AMDXDNA_INVALID_BO_HANDLE; - m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &arg); - - set_slotidx(arg.handle); - set_doorbell(arg.umq_doorbell); - - m_q->bind_hwctx(this); -} - -void hw_ctx::delete_ctx_on_device() { - if (m_handle == AMDXDNA_INVALID_CTX_HANDLE) return; - - m_q->unbind_hwctx(); - struct amdxdna_drm_destroy_hwctx arg = {}; - arg.handle = m_handle; - m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &arg); - - fini_log_buf(); -} - -void hw_ctx::init_log_buf() { - auto log_buf_size = m_num_cols * 1024; - m_log_bo = alloc_bo(nullptr, log_buf_size, XCL_BO_FLAGS_EXECBUF); - m_log_buf = m_log_bo->map(bo::map_type::write); - std::memset(m_log_buf, 0, log_buf_size); -} - -void hw_ctx::fini_log_buf(void) { - if (m_log_bo) m_log_bo->unmap(m_log_buf); -} - -void hw_ctx::set_doorbell(uint32_t db) { m_doorbell = db; } - -uint32_t hw_ctx::get_doorbell() const { return m_doorbell; } - -} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h deleted file mode 100644 index 9fec8cc6f..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h +++ /dev/null @@ -1,89 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#ifndef _HWCTX_XDNA_H_ -#define _HWCTX_XDNA_H_ - -#include - -#include "amdxdna_accel.h" -#include "shared.h" -#include "shim_debug.h" - -namespace shim_xdna { - -struct hw_q; // forward declaration -struct device; -struct bo; -struct cuidx_type; - -struct hw_ctx { - using qos_type = std::map; - enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; - - hw_ctx(const device& dev, const qos_type& qos, std::unique_ptr q); - - ~hw_ctx(); - - // TODO - void update_qos(const qos_type&) { shim_not_supported_err(__func__); } - - void update_access_mode(access_mode) { shim_not_supported_err(__func__); } - - uint32_t get_slotidx() const; - - hw_q* get_hw_queue(); - - std::unique_ptr alloc_bo(void* userptr, size_t size, uint64_t flags); - - std::unique_ptr alloc_bo(size_t size, uint64_t flags); - - std::unique_ptr import_bo(pid_t, shared_handle::export_handle); - - cuidx_type open_cu_context(const std::string& cuname); - - void close_cu_context(cuidx_type cuidx); - - void exec_buf(bo*) { shim_not_supported_err(__func__); } - - uint32_t get_doorbell() const; - - const device& get_device(); - - struct cu_info { - std::string m_name; - size_t m_func; - std::vector m_pdi; - }; - - const std::vector& get_cu_info() const; - - void set_slotidx(uint32_t id); - - void set_doorbell(uint32_t db); - - void create_ctx_on_device(); - - void init_log_buf(); - - void fini_log_buf(); - - const device& m_device; - uint32_t m_handle = AMDXDNA_INVALID_CTX_HANDLE; - amdxdna_qos_info m_qos = {}; - std::vector m_cu_info; - std::unique_ptr m_q; - uint32_t m_ops_per_cycle; - uint32_t m_num_cols; - uint32_t m_doorbell; - std::unique_ptr m_log_bo; - void* m_log_buf; - - void delete_ctx_on_device(); - - void init_qos_info(const qos_type& qos); -}; - -} // namespace shim_xdna - -#endif // _HWCTX_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp deleted file mode 100644 index 
25bc89ad0..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. - -#include "hwq.h" - -#include "bo.h" -#include "ert.h" -#include "fence.h" -#include "hwctx.h" -#include "pcidev.h" -#include "shim_debug.h" - -namespace { - -ert_packet *get_chained_command_pkt(shim_xdna::bo *boh) { - auto cmdpkt = - reinterpret_cast(boh->map(shim_xdna::bo::map_type::write)); - return cmdpkt->opcode == ERT_CMD_CHAIN ? cmdpkt : nullptr; -} - -int wait_cmd(const shim_xdna::pdev &pdev, const shim_xdna::hw_ctx *ctx, - shim_xdna::bo *cmd, uint32_t timeout_ms) { - int ret = 1; - auto boh = static_cast(cmd); - auto id = boh->get_cmd_id(); - - shim_xdna::shim_debug("Waiting for cmd (%ld)...", id); - - amdxdna_drm_wait_cmd wcmd = { - .hwctx = ctx->get_slotidx(), - .timeout = timeout_ms, - .seq = boh->get_cmd_id(), - }; - - pdev.ioctl(DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd); - return ret; -} - -} // namespace - -namespace shim_xdna { - -hw_q::hw_q(const device &device) - : m_hwctx(nullptr), - m_queue_boh(AMDXDNA_INVALID_BO_HANDLE), - m_pdev(device.get_pdev()) {} - -void hw_q::bind_hwctx(const hw_ctx *ctx) { - m_hwctx = ctx; - shim_debug("Bond HW queue to HW context %d", m_hwctx->get_slotidx()); -} - -void hw_q::unbind_hwctx() { - shim_debug("Unbond HW queue from HW context %d", m_hwctx->get_slotidx()); - m_hwctx = nullptr; -} - -uint32_t hw_q::get_queue_bo() { return m_queue_boh; } - -void hw_q::submit_command(bo *cmd) { issue_command(cmd); } - -int hw_q::poll_command(bo *cmd) const { - auto cmdpkt = reinterpret_cast(cmd->map(bo::map_type::write)); - - if (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) { - return 1; - } - return 0; -} - -int hw_q::wait_command(bo *cmd, uint32_t timeout_ms) const { - if (poll_command(cmd)) return 1; - return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); -} - -void hw_q::submit_wait(const fence *f) 
{ - auto fh = static_cast(f); - fh->submit_wait(m_hwctx); -} - -void hw_q::submit_wait(const std::vector &fences) { - fence::submit_wait(m_pdev, m_hwctx, fences); -} - -void hw_q::submit_signal(const fence *f) { - auto fh = static_cast(f); - fh->submit_signal(m_hwctx); -} - -} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h deleted file mode 100644 index 98442c49c..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. - -#ifndef _HWQ_XDNA_H_ -#define _HWQ_XDNA_H_ -#include -#include - -namespace shim_xdna { -struct device; -struct bo; -struct hw_ctx; -struct pdev; -struct fence; - -struct hw_q { - hw_q(const device &device); - - void submit_command(bo *); - - int poll_command(bo *) const; - - int wait_command(bo *, uint32_t timeout_ms) const; - - void submit_wait(const fence *); - - void submit_wait(const std::vector &); - - void submit_signal(const fence *); - - void bind_hwctx(const hw_ctx *ctx); - - void unbind_hwctx(); - - uint32_t get_queue_bo(); - - virtual void issue_command(bo *) = 0; - - const hw_ctx *m_hwctx; - const pdev &m_pdev; - uint32_t m_queue_boh; -}; - -} // namespace shim_xdna - -#endif // _HWQ_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp deleted file mode 100644 index f078d3e00..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.cpp +++ /dev/null @@ -1,443 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "pcidev.h" - -#include -#include -#include -#include - -#include -#include - -#include "amdxdna_accel.h" -#include "bo.h" -#include "pcidrv.h" -#include "shim_debug.h" - -namespace { - -std::string ioctl_cmd2name(unsigned long cmd) { - switch (cmd) { - case DRM_IOCTL_AMDXDNA_CREATE_HWCTX: - return "DRM_IOCTL_AMDXDNA_CREATE_HWCTX"; - case DRM_IOCTL_AMDXDNA_DESTROY_HWCTX: - return "DRM_IOCTL_AMDXDNA_DESTROY_HWCTX"; - case DRM_IOCTL_AMDXDNA_CONFIG_HWCTX: - return "DRM_IOCTL_AMDXDNA_CONFIG_HWCTX"; - case DRM_IOCTL_AMDXDNA_CREATE_BO: - return "DRM_IOCTL_AMDXDNA_CREATE_BO"; - case DRM_IOCTL_AMDXDNA_GET_BO_INFO: - return "DRM_IOCTL_AMDXDNA_GET_BO_INFO"; - case DRM_IOCTL_AMDXDNA_SYNC_BO: - return "DRM_IOCTL_AMDXDNA_SYNC_BO"; - case DRM_IOCTL_AMDXDNA_EXEC_CMD: - return "DRM_IOCTL_AMDXDNA_EXEC_CMD"; - case DRM_IOCTL_AMDXDNA_WAIT_CMD: - return "DRM_IOCTL_AMDXDNA_WAIT_CMD"; - case DRM_IOCTL_AMDXDNA_GET_INFO: - return "DRM_IOCTL_AMDXDNA_GET_INFO"; - case DRM_IOCTL_AMDXDNA_SET_STATE: - return "DRM_IOCTL_AMDXDNA_SET_STATE"; - case DRM_IOCTL_GEM_CLOSE: - return "DRM_IOCTL_GEM_CLOSE"; - case DRM_IOCTL_PRIME_HANDLE_TO_FD: - return "DRM_IOCTL_PRIME_HANDLE_TO_FD"; - case DRM_IOCTL_PRIME_FD_TO_HANDLE: - return "DRM_IOCTL_PRIME_FD_TO_HANDLE"; - case DRM_IOCTL_SYNCOBJ_CREATE: - return "DRM_IOCTL_SYNCOBJ_CREATE"; - case DRM_IOCTL_SYNCOBJ_QUERY: - return "DRM_IOCTL_SYNCOBJ_QUERY"; - case DRM_IOCTL_SYNCOBJ_DESTROY: - return "DRM_IOCTL_SYNCOBJ_DESTROY"; - case DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD: - return "DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD"; - case DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE: - return "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE"; - case DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL: - return "DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL"; - case DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT: - return "DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT"; - } - - return "UNKNOWN(" + std::to_string(cmd) + ")"; -} - -size_t bar_size(const std::string& dir, unsigned bar) { - std::ifstream ifs(dir + "/resource"); - if (!ifs.good()) return 0; - std::string 
line; - for (unsigned i = 0; i <= bar; i++) { - line.clear(); - std::getline(ifs, line); - } - long long start, end, meta; - if (sscanf(line.c_str(), "0x%llx 0x%llx 0x%llx", &start, &end, &meta) != 3) - return 0; - return end - start + 1; -} - -int get_render_value(const std::string& dir, - const std::string& devnode_prefix) { - struct dirent* entry; - DIR* dp; - int instance_num = INVALID_ID; - - dp = opendir(dir.c_str()); - if (dp == nullptr) return instance_num; - - while ((entry = readdir(dp))) { - std::string dirname{entry->d_name}; - if (dirname.compare(0, devnode_prefix.size(), devnode_prefix) == 0) { - instance_num = std::stoi(dirname.substr(devnode_prefix.size())); - break; - } - } - - closedir(dp); - - return instance_num; -} - -bool is_admin() { return (getuid() == 0) || (geteuid() == 0); } - -const size_t dev_mem_size = (64 << 20); - -} // namespace - -namespace sysfs { - -static constexpr const char* dev_root = "/sys/bus/pci/devices/"; - -static std::string get_name(const std::string& dir, const std::string& subdir) { - std::string line; - std::ifstream ifs(dir + "/" + subdir + "/name"); - - if (ifs.is_open()) std::getline(ifs, line); - - return line; -} - -// Helper to find subdevice directory name -// Assumption: all subdevice's sysfs directory name starts with subdevice name!! 
-static int get_subdev_dir_name(const std::string& dir, - const std::string& subDevName, - std::string& subdir) { - DIR* dp; - size_t sub_nm_sz = subDevName.size(); - - subdir = ""; - if (subDevName.empty()) return 0; - - int ret = -ENOENT; - dp = opendir(dir.c_str()); - if (dp) { - struct dirent* entry; - while ((entry = readdir(dp))) { - std::string nm = get_name(dir, entry->d_name); - if (!nm.empty()) { - if (nm != subDevName) continue; - } else if (strncmp(entry->d_name, subDevName.c_str(), sub_nm_sz) != 0 || - entry->d_name[sub_nm_sz] != '.') { - continue; - } - // found it - subdir = entry->d_name; - ret = 0; - break; - } - closedir(dp); - } - - return ret; -} - -static std::string get_path(const std::string& name, const std::string& subdev, - const std::string& entry) { - std::string subdir; - if (get_subdev_dir_name(dev_root + name, subdev, subdir) != 0) return ""; - - std::string path = dev_root; - path += name; - path += "/"; - path += subdir; - path += "/"; - path += entry; - return path; -} - -static std::fstream open_path(const std::string& path, std::string& err, - bool write, bool binary) { - std::fstream fs; - std::ios::openmode mode = write ? std::ios::out : std::ios::in; - - if (binary) mode |= std::ios::binary; - - err.clear(); - fs.open(path, mode); - if (!fs.is_open()) { - std::stringstream ss; - ss << "Failed to open " << path << " for " << (binary ? "binary " : "") - << (write ? 
"writing" : "reading") << ": " << strerror(errno) - << std::endl; - err = ss.str(); - } - return fs; -} - -static std::fstream open(const std::string& name, const std::string& subdev, - const std::string& entry, std::string& err, bool write, - bool binary) { - std::fstream fs; - auto path = get_path(name, subdev, entry); - - if (path.empty()) { - std::stringstream ss; - ss << "Failed to find subdirectory for " << subdev << " under " - << dev_root + name << std::endl; - err = ss.str(); - } else { - fs = open_path(path, err, write, binary); - } - - return fs; -} - -static void get(const std::string& name, const std::string& subdev, - const std::string& entry, std::string& err, - std::vector& sv) { - std::fstream fs = open(name, subdev, entry, err, false, false); - if (!err.empty()) return; - - sv.clear(); - std::string line; - while (std::getline(fs, line)) sv.push_back(line); -} - -static void get(const std::string& name, const std::string& subdev, - const std::string& entry, std::string& err, - std::vector& iv) { - iv.clear(); - - std::vector sv; - get(name, subdev, entry, err, sv); - if (!err.empty()) return; - - for (auto& s : sv) { - if (s.empty()) { - std::stringstream ss; - ss << "Reading " << get_path(name, subdev, entry) << ", "; - ss << "can't convert empty string to integer" << std::endl; - err = ss.str(); - break; - } - char* end = nullptr; - auto n = std::strtoull(s.c_str(), &end, 0); - if (*end != '\0') { - std::stringstream ss; - ss << "Reading " << get_path(name, subdev, entry) << ", "; - ss << "failed to convert string to integer: " << s << std::endl; - err = ss.str(); - break; - } - iv.push_back(n); - } -} - -static void get(const std::string& name, const std::string& subdev, - const std::string& entry, std::string& err, std::string& s) { - std::vector sv; - get(name, subdev, entry, err, sv); - if (!sv.empty()) - s = sv[0]; - else - s = ""; // default value -} - -static void get(const std::string& name, const std::string& subdev, - const 
std::string& entry, std::string& err, - std::vector& buf) { - std::fstream fs = open(name, subdev, entry, err, false, true); - if (!err.empty()) return; - - buf.clear(); - buf.insert(std::end(buf), std::istreambuf_iterator(fs), - std::istreambuf_iterator()); -} - -static void put(const std::string& name, const std::string& subdev, - const std::string& entry, std::string& err, - const std::string& input) { - std::fstream fs = open(name, subdev, entry, err, true, false); - if (!err.empty()) return; - fs << input; - fs.close(); // flush and close, if either fails then stream failbit is set. - if (!fs.good()) { - std::stringstream ss; - ss << "Failed to write " << get_path(name, subdev, entry) << ": " - << strerror(errno) << std::endl; - err = ss.str(); - } -} - -static void put(const std::string& name, const std::string& subdev, - const std::string& entry, std::string& err, - const std::vector& buf) { - std::fstream fs = open(name, subdev, entry, err, true, true); - if (!err.empty()) return; - - fs.write(buf.data(), buf.size()); - fs.close(); // flush and close, if either fails then stream failbit is set. - if (!fs.good()) { - std::stringstream ss; - ss << "Failed to write " << get_path(name, subdev, entry) << ": " - << strerror(errno) << std::endl; - err = ss.str(); - } -} - -static void put(const std::string& name, const std::string& subdev, - const std::string& entry, std::string& err, - const unsigned int& input) { - std::fstream fs = open(name, subdev, entry, err, true, false); - if (!err.empty()) return; - fs << input; - fs.close(); // flush and close, if either fails then stream failbit is set. 
- if (!fs.good()) { - std::stringstream ss; - ss << "Failed to write " << get_path(name, subdev, entry) << ": " - << strerror(errno) << std::endl; - err = ss.str(); - } -} - -} // namespace sysfs - -namespace shim_xdna { - -void pdev::sysfs_get(const std::string& subdev, const std::string& entry, - std::string& err, std::vector& ret) const { - sysfs::get(m_sysfs_name, subdev, entry, err, ret); -} - -pdev::pdev(std::shared_ptr driver, std::string sysfs_name) - : m_driver(std::move(driver)), m_sysfs_name(std::move(sysfs_name)) { - std::string err; - - if (sscanf(m_sysfs_name.c_str(), "%hx:%hx:%hx.%hx", &m_domain, &m_bus, &m_dev, - &m_func) < 4) - llvm::report_fatal_error(llvm::Twine(m_sysfs_name) + " is not valid BDF"); - - m_is_mgmt = !m_driver->is_user(); - - if (m_is_mgmt) { - sysfs_get("", "instance", err, m_instance, - static_cast(INVALID_ID)); - } else { - m_instance = get_render_value( - sysfs::dev_root + m_sysfs_name + "/" + m_driver->sysfs_dev_node_dir(), - m_driver->dev_node_prefix()); - } - - sysfs_get("", "userbar", err, m_user_bar, 0); - m_user_bar_size = bar_size(sysfs::dev_root + m_sysfs_name, m_user_bar); - sysfs_get("", "ready", err, m_is_ready, false); - m_user_bar_map = reinterpret_cast(MAP_FAILED); - m_is_ready = true; // We're always ready. -} - -pdev::~pdev() { - if (m_dev_fd != -1) shim_debug("Device node fd leaked!! 
fd=%d", m_dev_fd); -} - -std::string pdev::get_subdev_path(const std::string& subdev, uint idx) const { - // Main devfs path - if (subdev.empty()) { - std::string instStr = std::to_string(m_instance); - std::string prefixStr = "/dev/"; - prefixStr += m_driver->dev_node_dir() + "/" + m_driver->dev_node_prefix(); - return prefixStr + instStr; - } - - llvm::report_fatal_error("subdev path not supported"); -} - -int pdev::open(const std::string& subdev, uint32_t idx, int flag) const { - if (m_is_mgmt && !::is_admin()) - llvm::report_fatal_error("Root privileges required"); - - std::string devfs = get_subdev_path(subdev, idx); - return ::open(devfs.c_str(), flag); -} - -int pdev::open(const std::string& subdev, int flag) const { - return open(subdev, 0, flag); -} - -void pdev::open() const { - int fd; - const std::lock_guard lock(m_lock); - - if (m_dev_users == 0) { - fd = pdev::open("", O_RDWR); - if (fd < 0) - shim_err(EINVAL, "Failed to open KMQ device"); - else - shim_debug("Device opened, fd=%d", fd); - // Publish the fd for other threads to use. - m_dev_fd = fd; - } - ++m_dev_users; -} - -void pdev::close() const { - int fd; - const std::lock_guard lock(m_lock); - - --m_dev_users; - if (m_dev_users == 0) { - on_last_close(); - - // Stop new users of the fd from other threads. - fd = m_dev_fd; - m_dev_fd = -1; - // Kernel will wait for existing users to quit. 
- ::close(fd); - shim_debug("Device closed, fd=%d", fd); - } -} - -void pdev::ioctl(unsigned long cmd, void* arg) const { - if (::ioctl(m_dev_fd, cmd, arg) == -1) - shim_err(errno, "%s IOCTL failed", ioctl_cmd2name(cmd).c_str()); -} - -void* pdev::mmap(void* addr, size_t len, int prot, int flags, - off_t offset) const { - void* ret = ::mmap(addr, len, prot, flags, m_dev_fd, offset); - - if (ret == reinterpret_cast(-1)) - shim_err(errno, - "mmap(addr=%p, len=%ld, prot=%d, flags=%d, offset=%ld) failed", - addr, len, prot, flags, offset); - return ret; -} - -void pdev::munmap(void* addr, size_t len) const { ::munmap(addr, len); } - -std::shared_ptr pdev::create_device(void* handle) const { - auto dev = std::make_shared(*this, handle); - // Alloc device memory on first device creation. - // No locking is needed since driver will ensure only one heap BO is - // created. - if (m_dev_heap_bo == nullptr) - m_dev_heap_bo = - std::make_unique(*dev, dev_mem_size, AMDXDNA_BO_DEV_HEAP); - return dev; -} - -void pdev::on_last_close() const { m_dev_heap_bo.reset(); } - -} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h deleted file mode 100644 index a84fa646c..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidev.h +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#ifndef PCIDEV_XDNA_H -#define PCIDEV_XDNA_H - -#include - -#include - -#include "bo.h" -#include "device.h" -#include "pcidev.h" -#include "shim_debug.h" - -namespace shim_xdna { - -#define INVALID_ID 0xffff - -struct drv; - -struct pdev { - pdev(std::shared_ptr driver, std::string sysfs_name); - ~pdev(); - - void sysfs_get(const std::string& subdev, const std::string& entry, - std::string& err, std::vector& iv) const; - - template - void sysfs_get(const std::string& subdev, const std::string& entry, - std::string& err, T& i, const T& default_val) { - std::vector iv; - sysfs_get(subdev, entry, err, iv); - if (!iv.empty()) - i = static_cast(iv[0]); - else - i = static_cast(default_val); // default value - } - - std::string get_subdev_path(const std::string& subdev, uint32_t idx) const; - - std::shared_ptr create_device(void* handle) const; - - void ioctl(unsigned long cmd, void* arg) const; - - void* mmap(void* addr, size_t len, int prot, int flags, off_t offset) const; - - void munmap(void* addr, size_t len) const; - - int open(const std::string& subdev, uint32_t idx, int flag) const; - int open(const std::string& subdev, int flag) const; - - void open() const; - - void close() const; - - void on_last_close() const; - int map_usr_bar() const; - - // Virtual address of memory mapped BAR0, mapped on first use, once mapped, - // never change. 
- mutable char* m_user_bar_map = reinterpret_cast(MAP_FAILED); - - std::shared_ptr m_driver; - mutable int m_dev_fd = -1; - mutable int m_dev_users = 0; - mutable std::mutex m_lock; - uint16_t m_domain = INVALID_ID; - uint16_t m_bus = INVALID_ID; - uint16_t m_dev = INVALID_ID; - uint16_t m_func = INVALID_ID; - uint32_t m_instance = INVALID_ID; - std::string m_sysfs_name; // dir name under /sys/bus/pci/devices - int m_user_bar = 0; // BAR mapped in by tools, default is BAR0 - size_t m_user_bar_size = 0; - bool m_is_mgmt = false; - bool m_is_ready = false; - - // Create on first device creation and removed right before device is closed - mutable std::unique_ptr m_dev_heap_bo; -}; - -} // namespace shim_xdna - -#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp deleted file mode 100644 index 5841e1bb3..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
-// -#include "pcidrv.h" - -#include - -#include "amdxdna_accel.h" -#include "pcidev.h" - -namespace { - -amdxdna_device_type get_dev_type(const std::string& sysfs) { - const std::string sysfs_root{"/sys/bus/pci/devices/"}; - const std::string dev_type_path = sysfs_root + sysfs + "/device_type"; - - std::ifstream ifs(dev_type_path); - if (!ifs.is_open()) - llvm::report_fatal_error(llvm::Twine(dev_type_path) + " is missing?"); - - std::string line; - std::getline(ifs, line); - return static_cast(std::stoi(line)); -} - -} // namespace - -namespace shim_xdna { - -std::string drv::name() const { return "amdxdna"; } - -std::string drv::dev_node_prefix() const { return "accel"; } - -std::string drv::dev_node_dir() const { return "accel"; } - -std::string drv::sysfs_dev_node_dir() const { return "accel"; } - -bool drv::is_user() const { return true; } - -std::shared_ptr drv::create_pcidev(const std::string& sysfs) const { - auto t = get_dev_type(sysfs); - auto driver = std::static_pointer_cast(shared_from_this()); - if (t == AMDXDNA_DEV_TYPE_KMQ) return std::make_shared(driver, sysfs); - // if (t == AMDXDNA_DEV_TYPE_UMQ) - // return std::make_shared(driver, sysfs); - shim_err(-EINVAL, "Unknown device type: %d", t); -} - -} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h deleted file mode 100644 index 95cf8757e..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/pcidrv.h +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#ifndef _PCIDRV_XDNA_H_ -#define _PCIDRV_XDNA_H_ - -#include - -#include "pcidev.h" - -namespace shim_xdna { - -struct drv : std::enable_shared_from_this { - std::string name() const; - bool is_user() const; - std::string dev_node_prefix() const; - std::string dev_node_dir() const; - std::string sysfs_dev_node_dir() const; - std::shared_ptr create_pcidev(const std::string& sysfs) const; -}; - -} // namespace shim_xdna - -#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h deleted file mode 100644 index ff026880d..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shared.h +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. - -#ifndef _SHARED_XDNA_H_ -#define _SHARED_XDNA_H_ - -#include - -namespace shim_xdna { - -struct shared_handle { - shared_handle(int fd) : m_fd(fd) {} - ~shared_handle() { - if (m_fd != -1) close(m_fd); - } - using export_handle = int; - export_handle get_export_handle() const { return m_fd; } - - const int m_fd; -}; - -} // namespace shim_xdna - -#endif // _SHARED_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp deleted file mode 100644 index d761a3995..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// -// Created by mlevental on 10/2/24. 
-// - -#include -#include -#include - -static std::recursive_mutex s_debug_mutex; - -namespace shim_xdna { -struct debug_lock { - std::lock_guard m_lk; - debug_lock(); -}; - -debug_lock::debug_lock() : m_lk(s_debug_mutex) {} - -unsigned long time_ns() { - static auto zero = std::chrono::high_resolution_clock::now(); - auto now = std::chrono::high_resolution_clock::now(); - auto integral_duration = - std::chrono::duration_cast(now - zero).count(); - return static_cast(integral_duration); -} - -void debugf(const char* format, ...) { - debug_lock lk; - va_list args; - va_start(args, format); - printf("%lu: ", time_ns()); - vprintf(format, args); - va_end(args); -} -} // namespace shim_xdna \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h deleted file mode 100644 index e37dc2b55..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. -// -#ifndef SHIM_DEBUG_H -#define SHIM_DEBUG_H - -#include - -#include -#include -#include - -#include "llvm/Support/Error.h" - -namespace shim_xdna { - -void debugf(const char* format, ...); - -#define XRT_PRINTF(format, ...) debugf(format, ##__VA_ARGS__) // NOLINT - -template -[[noreturn]] void shim_err(int, const char* fmt, Args&&... args) { - std::string format = std::string(fmt); - format += " (err=%d)"; - int sz = std::snprintf(nullptr, 0, "%s", format.c_str(), args...) 
+ 1; - if (sz <= 0) llvm::report_fatal_error("could not format error string"); - - auto size = static_cast(sz); - std::unique_ptr buf(new char[size]); - std::snprintf(buf.get(), size, "%s", format.c_str(), args...); - llvm::report_fatal_error(buf.get()); -} - -[[noreturn]] inline void shim_not_supported_err(const char* msg) { - shim_err(0, msg); -} - -template -void shim_debug(const char* fmt, Args&&... args) { - std::string format = "PID(%d): "; - format += std::string(fmt); - format += "\n"; - XRT_PRINTF(format.c_str(), getpid(), std::forward(args)...); -} - -template -void shim_info(const char* fmt, Args&&... args) { - std::string format = "PID(%d): "; - format += std::string(fmt); - format += "\n"; - XRT_PRINTF(format.c_str(), getpid(), std::forward(args)...); -} - -} // namespace shim_xdna - -#endif // SHIM_DEBUG_H From b972a9832ec575f7c0f13b63966612c244742593 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 10 Oct 2024 07:42:39 -0400 Subject: [PATCH 05/35] bring shim back in-tree --- .../driver/xrt-lite/CMakeLists.txt | 5 +- .../src/iree-amd-aie/driver/xrt-lite/api.h | 7 +- .../driver/xrt-lite/cts/CMakeLists.txt | 42 +- .../iree-amd-aie/driver/xrt-lite/device.cc | 90 +- .../iree-amd-aie/driver/xrt-lite/driver.cc | 81 +- .../driver/xrt-lite/shim/CMakeLists.txt | 10 + .../driver/xrt-lite/shim/linux/CMakeLists.txt | 8 + .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 33 + .../xrt-lite/shim/linux/kmq/amdxdna_accel.h | 591 +++++++++ .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 477 +++++++ .../driver/xrt-lite/shim/linux/kmq/bo.h | 158 +++ .../driver/xrt-lite/shim/linux/kmq/device.cpp | 282 ++++ .../driver/xrt-lite/shim/linux/kmq/device.h | 70 + .../driver/xrt-lite/shim/linux/kmq/ert.h | 1163 +++++++++++++++++ .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 240 ++++ .../driver/xrt-lite/shim/linux/kmq/fence.h | 62 + .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 200 +++ .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 78 ++ .../driver/xrt-lite/shim/linux/kmq/hwq.cpp 
| 108 ++ .../driver/xrt-lite/shim/linux/kmq/hwq.h | 33 + .../xrt-lite/shim/linux/kmq/shim_debug.cpp | 34 + .../xrt-lite/shim/linux/kmq/shim_debug.h | 48 + 22 files changed, 3789 insertions(+), 31 deletions(-) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt index 3c605d231..0fdb39b87 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -16,8 +16,6 @@ iree_register_external_hal_driver( 
iree_hal_xrt_lite_driver_module_register ) -find_package(ShimXDNA REQUIRED) - iree_cc_library( NAME xrt-lite @@ -31,8 +29,7 @@ iree_cc_library( iree::base::core_headers iree::base::internal::flatcc::parsing iree-amd-aie::schemas::xrt_executable_def_c_fbs - xrt_driver_xdna - $ + iree-amd-aie::driver::xrt-lite::shim::linux::kmq::shim-xdna COPTS $<$:-fexceptions -frtti> $<$:/EHsc /GR> diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h index af257ac50..08c760682 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -44,8 +44,13 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( const struct iree_hal_xrt_lite_driver_options_t* options, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver); +IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create( + iree_string_view_t identifier, + const struct iree_hal_xrt_lite_device_options_t* options, + iree_allocator_t host_allocator, iree_hal_device_t** out_device); + #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt index eacafbf82..e18221bda 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -6,6 +6,26 @@ include(CMakeDependentOption) +iree_hal_cts_test_suite( + DRIVER_NAME + xrt-lite + DRIVER_REGISTRATION_HDR + "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" + DRIVER_REGISTRATION_FN + "iree_hal_xrt_lite_driver_module_register" + COMPILER_TARGET_BACKEND + "amd-aie" + EXECUTABLE_FORMAT + "\"amdaie-xclbin-fb\"" + DEPS + iree-amd-aie::driver::xrt-lite::registration + INCLUDED_TESTS + "allocator" +# "buffer_mapping" +# "command_buffer" + "driver" +) + #set(PEANO_INSTALL_DIR "" 
CACHE PATH "") #set(VITIS_DIR "" CACHE PATH "") #if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) @@ -54,27 +74,7 @@ include(CMakeDependentOption) # PUBLIC # TESTONLY #) - -iree_hal_cts_test_suite( - DRIVER_NAME - xrt-lite - DRIVER_REGISTRATION_HDR - "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" - DRIVER_REGISTRATION_FN - "iree_hal_xrt_lite_driver_module_register" - COMPILER_TARGET_BACKEND - "amd-aie" - EXECUTABLE_FORMAT - "\"amdaie-xclbin-fb\"" - DEPS - iree-amd-aie::driver::xrt-lite::registration - INCLUDED_TESTS -# "allocator" -# "buffer_mapping" -# "command_buffer" - "driver" -) - +# #iree_cc_test( # NAME # xrt_lite_command_buffer_dispatch_test diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 4768c0cf7..ec4366945 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -7,13 +7,13 @@ #include "iree-amd-aie/driver/xrt-lite/device.h" #include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" struct iree_hal_xrt_lite_device_t { iree_hal_resource_t resource; iree_string_view_t identifier; - iree_allocator_t host_allocator; - iree_hal_allocator_t* device_allocator; + std::shared_ptr shim_device; }; namespace { @@ -29,6 +29,90 @@ void iree_hal_xrt_lite_device_options_initialize( // mechanism accessible here. 
} +iree_status_t iree_hal_xrt_lite_device_create( + iree_string_view_t identifier, + const iree_hal_xrt_lite_device_options_t* options, + iree_allocator_t host_allocator, iree_hal_device_t** out_device) { + IREE_ASSERT_ARGUMENT(options); + IREE_ASSERT_ARGUMENT(out_device); + *out_device = nullptr; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device_t* device = nullptr; + iree_host_size_t total_size = sizeof(*device) + identifier.size; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, total_size, (void**)&device)); + iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, + &device->resource); + iree_string_view_append_to_buffer( + identifier, &device->identifier, + reinterpret_cast(device) + total_size - identifier.size); + device->host_allocator = host_allocator; + + // TODO(null): pass device handles and pool configuration to the allocator. + // Some implementations may share allocators across multiple devices created + // from the same driver. 
+ // TODO(max): + // iree_status_t status = iree_hal_xrt_lite_allocator_create( + // host_allocator, &device->device_allocator); + // TOOD(max): device id + + device->shim_device = std::make_shared(); + + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + *out_device = reinterpret_cast(device); + } else { + iree_hal_device_release(reinterpret_cast(device)); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static iree_hal_xrt_lite_device_t* iree_hal_xrt_lite_device_cast( + iree_hal_device_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_device_vtable); + return reinterpret_cast(base_value); +} + +static iree_string_view_t iree_hal_xrt_lite_device_id( + iree_hal_device_t* base_device) { + iree_hal_xrt_lite_device_t* device = + iree_hal_xrt_lite_device_cast(base_device); + return device->identifier; +} + +static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { + iree_hal_xrt_lite_device_t* device = + iree_hal_xrt_lite_device_cast(base_device); + iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device); + IREE_TRACE_ZONE_BEGIN(z0); + + // TODO(null): release all implementation resources here. It's expected that + // this is only called once all outstanding resources created with this device + // have been released by the application and no work is outstanding. If the + // implementation performs internal async operations those should be shutdown + // and joined first. 
+ + device->shim_device.reset(); + iree_allocator_free(host_allocator, device); + + IREE_TRACE_ZONE_END(z0); +}; + +static iree_allocator_t iree_hal_xrt_lite_device_host_allocator( + iree_hal_device_t* base_device) { + iree_hal_xrt_lite_device_t* device = + iree_hal_xrt_lite_device_cast(base_device); + return device->host_allocator; +} + namespace { -const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = {}; +const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { + .destroy = iree_hal_xrt_lite_device_destroy, + .id = iree_hal_xrt_lite_device_id, + .host_allocator = iree_hal_xrt_lite_device_host_allocator, +}; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 1f1c489f2..3143e6208 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" #include "util.h" typedef struct iree_hal_xrt_lite_driver_t { @@ -22,7 +23,7 @@ extern const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable; static iree_hal_xrt_lite_driver_t* iree_hal_xrt_lite_driver_cast( iree_hal_driver_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_driver_vtable); - return (iree_hal_xrt_lite_driver_t*)base_value; + return reinterpret_cast(base_value); } void iree_hal_xrt_lite_driver_options_initialize( @@ -87,6 +88,7 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( } else { iree_hal_driver_release((iree_hal_driver_t*)driver); } + IREE_TRACE_ZONE_END(z0); return status; } @@ -104,9 +106,84 @@ static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { IREE_TRACE_ZONE_END(z0); } +#define IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT 0 + +static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( + 
iree_hal_driver_t* base_driver, iree_allocator_t host_allocator, + iree_host_size_t* out_device_info_count, + iree_hal_device_info_t** out_device_infos) { + // TODO(null): query available devices and populate the output. Note that + // unlike most IREE functions this allocates if required in order to allow + // this to return uncached information. Uncached is preferred as it allows + // devices that may come and go (power toggles, user visibility toggles, etc) + // through a process lifetime to appear without needing a full restart. + static const iree_hal_device_info_t device_infos[1] = { + { + .device_id = IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT, + .name = iree_string_view_literal("default"), + }, + }; + *out_device_info_count = IREE_ARRAYSIZE(device_infos); + return iree_allocator_clone( + host_allocator, + iree_make_const_byte_span(device_infos, sizeof(device_infos)), + (void**)out_device_infos); +} + +static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( + iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id, + iree_host_size_t param_count, const iree_string_pair_t* params, + iree_allocator_t host_allocator, iree_hal_device_t** out_device) { + iree_hal_xrt_lite_driver_t* driver = + iree_hal_xrt_lite_driver_cast(base_driver); + + // TODO(null): use the provided params to overwrite the default options. The + // format of the params is implementation-defined. The params strings can be + // directly referenced if needed as the device creation is only allowed to + // access them during the create call below. + iree_hal_xrt_lite_device_options_t options = + driver->options.default_device_options; + + // TODO(null): implement creation by device_id; this is mostly used as + // query_available_devices->create_device_by_id to avoid needing to expose + // device paths (which may not always be 1:1). This skeleton only has a single + // device so the ID is ignored. 
+ (void)driver; + + return iree_hal_xrt_lite_device_create(driver->identifier, &options, + host_allocator, out_device); +} + +static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path( + iree_hal_driver_t* base_driver, iree_string_view_t driver_name, + iree_string_view_t device_path, iree_host_size_t param_count, + const iree_string_pair_t* params, iree_allocator_t host_allocator, + iree_hal_device_t** out_device) { + iree_hal_xrt_lite_driver_t* driver = + iree_hal_xrt_lite_driver_cast(base_driver); + + // TODO(null): use the provided params to overwrite the default options. The + // format of the params is implementation-defined. The params strings can be + // directly referenced if needed as the device creation is only allowed to + // access them during the create call below. + iree_hal_xrt_lite_device_options_t options = + driver->options.default_device_options; + + // TODO(null): support parsing of the device_path. Note that a single driver + // may respond to multiple driver_name queries. Paths are + // implementation-specific and there may be multiple formats; for example, + // device UUID, PCI bus ID, ordinal as used by underlying APIs, etc. 
+ (void)driver; + + return iree_hal_xrt_lite_device_create(driver->identifier, &options, + host_allocator, out_device); +} + namespace { const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable = { .destroy = iree_hal_xrt_lite_driver_destroy, - .query_available_devices = unimplemented, + .query_available_devices = iree_hal_xrt_lite_driver_query_available_devices, + .create_device_by_id = iree_hal_xrt_lite_driver_create_device_by_id, + .create_device_by_path = iree_hal_xrt_lite_driver_create_device_by_path, }; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt new file mode 100644 index 000000000..ac1522216 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if(UNIX) + add_subdirectory(linux) +endif() \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt new file mode 100644 index 000000000..afe3d583a --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_subdirectory(kmq) \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt new file mode 100644 index 000000000..f6a25b1d4 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_cc_library( + NAME + shim-xdna + SRCS + amdxdna_accel.h + bo.cpp + bo.h + device.cpp + device.h + ert.h + fence.cpp + fence.h + hwctx.cpp + hwctx.h + hwq.cpp + hwq.h + shim_debug.cpp + shim_debug.h + DEPS + uuid + $ + COPTS + $<$:-fexceptions -frtti> + $<$:/EHsc /GR> + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h new file mode 100644 index 000000000..cc8ec252f --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h @@ -0,0 +1,591 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef AMDXDNA_ACCEL_H_ +#define AMDXDNA_ACCEL_H_ + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#define AMDXDNA_DRIVER_MAJOR 1 +#define AMDXDNA_DRIVER_MINOR 0 + +#define AMDXDNA_INVALID_ADDR (~0UL) +#define AMDXDNA_INVALID_CTX_HANDLE 0 +#define AMDXDNA_INVALID_BO_HANDLE 0 +#define AMDXDNA_INVALID_FENCE_HANDLE 0 + +/* + * The interface can grow/extend over time. + * On each struct amdxdna_drm_*, to support potential extension, we defined it + * like this. 
+ * + * Example code: + * + * struct amdxdna_drm_example_data { + * .ext = (uintptr_t)&example_data_ext; + * ... + * }; + * + * We don't have extension now. The extension struct will define in the future. + */ + +enum amdxdna_drm_ioctl_id { + DRM_AMDXDNA_CREATE_HWCTX, + DRM_AMDXDNA_DESTROY_HWCTX, + DRM_AMDXDNA_CONFIG_HWCTX, + DRM_AMDXDNA_CREATE_BO, + DRM_AMDXDNA_GET_BO_INFO, + DRM_AMDXDNA_SYNC_BO, + DRM_AMDXDNA_EXEC_CMD, + DRM_AMDXDNA_WAIT_CMD, + DRM_AMDXDNA_GET_INFO, + DRM_AMDXDNA_SET_STATE, + DRM_AMDXDNA_NUM_IOCTLS +}; + +enum amdxdna_device_type { + AMDXDNA_DEV_TYPE_UNKNOWN = -1, + AMDXDNA_DEV_TYPE_KMQ, + AMDXDNA_DEV_TYPE_UMQ, +}; + +/** + * struct qos_info - QoS information for driver. + * @gops: Giga operations per second. + * @fps: Frames per second. + * @dma_bandwidth: DMA bandwidtha. + * @latency: Frame response latency. + * @frame_exec_time: Frame execution time. + * @priority: Request priority. + * + * User program can provide QoS hints to driver. + */ +struct amdxdna_qos_info { + __u32 gops; + __u32 fps; + __u32 dma_bandwidth; + __u32 latency; + __u32 frame_exec_time; + __u32 priority; +}; + +/** + * struct amdxdna_drm_create_hwctx - Create hardware context. + * @ext: MBZ. + * @ext_flags: MBZ. + * @qos_p: Address of QoS info. + * @umq_bo: BO handle for user mode queue(UMQ). + * @log_buf_bo: BO handle for log buffer. + * @max_opc: Maximum operations per cycle. + * @num_tiles: Number of AIE tiles. + * @mem_size: Size of AIE tile memory. + * @umq_doorbell: Returned offset of doorbell associated with UMQ. + * @handle: Returned hardware context handle. + */ +struct amdxdna_drm_create_hwctx { + __u64 ext; + __u64 ext_flags; + __u64 qos_p; + __u32 umq_bo; + __u32 log_buf_bo; + __u32 max_opc; + __u32 num_tiles; + __u32 mem_size; + __u32 umq_doorbell; + __u32 handle; +}; + +/** + * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. + * @handle: Hardware context handle. + * @pad: MBZ. 
+ */ +struct amdxdna_drm_destroy_hwctx { + __u32 handle; + __u32 pad; +}; + +/** + * struct amdxdna_cu_config - configuration for one CU + * @cu_bo: CU configuration buffer bo handle + * @cu_func: Functional of a CU + * @pad: MBZ + */ +struct amdxdna_cu_config { + __u32 cu_bo; + __u8 cu_func; + __u8 pad[3]; +}; + +/** + * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware + * context + * @num_cus: Number of CUs to configure + * @pad: MBZ + * @cu_configs: Array of CU configurations of struct amdxdna_cu_config + */ +struct amdxdna_hwctx_param_config_cu { + __u16 num_cus; + __u16 pad[3]; + struct amdxdna_cu_config cu_configs[1]; +}; + +enum amdxdna_drm_config_hwctx_param { + DRM_AMDXDNA_HWCTX_CONFIG_CU, + DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + DRM_AMDXDNA_HWCTX_CONFIG_NUM +}; + +/** + * struct amdxdna_drm_config_hwctx - Configure hardware context. + * @handle: hardware context handle. + * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the + * structure passed in via param_val. + * @param_val: A structure specified by the param_type struct member. + * @param_val_size: Size of the parameter buffer pointed to by the param_val. + * If param_val is not a pointer, driver can ignore this. + * + * Note: if the param_val is a pointer pointing to a buffer, the maximum size + * of the buffer is 4KiB(PAGE_SIZE). 
+ */ +struct amdxdna_drm_config_hwctx { + __u32 handle; + __u32 param_type; + __u64 param_val; + __u32 param_val_size; + __u32 pad; +}; + +/* + * AMDXDNA_BO_SHMEM: DRM GEM SHMEM bo + * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory + * AMDXDNA_BO_DEV_BO: Allocated from BO_DEV_HEAP + * AMDXDNA_BO_CMD: User and driver accessible bo + * AMDXDNA_BO_DMA: DRM GEM DMA bo + */ +enum amdxdna_bo_type { + AMDXDNA_BO_INVALID = 0, + AMDXDNA_BO_SHMEM, + AMDXDNA_BO_DEV_HEAP, + AMDXDNA_BO_DEV, + AMDXDNA_BO_CMD, + AMDXDNA_BO_DMA, +}; + +/** + * struct amdxdna_drm_create_bo - Create a buffer object. + * @flags: Buffer flags. MBZ. + * @type: Buffer type. + * @vaddr: User VA of buffer if applied. MBZ. + * @size: Size in bytes. + * @handle: Returned DRM buffer object handle. + */ +struct amdxdna_drm_create_bo { + __u64 flags; + __u32 type; + __u32 _pad; + __u64 vaddr; + __u64 size; + __u32 handle; +}; + +/** + * struct amdxdna_drm_get_bo_info - Get buffer object information. + * @ext: MBZ. + * @ext_flags: MBZ. + * @handle: DRM buffer object handle. + * @map_offset: Returned DRM fake offset for mmap(). + * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). + * @xdna_addr: Returned XDNA device virtual address. + */ +struct amdxdna_drm_get_bo_info { + __u64 ext; + __u64 ext_flags; + __u32 handle; + __u32 _pad; + __u64 map_offset; + __u64 vaddr; + __u64 xdna_addr; +}; + +/** + * struct amdxdna_drm_sync_bo - Sync buffer object. + * @handle: Buffer object handle. + * @direction: Direction of sync, can be from device or to device. + * @offset: Offset in the buffer to sync. + * @size: Size in bytes. + */ +struct amdxdna_drm_sync_bo { + __u32 handle; +#define SYNC_DIRECT_TO_DEVICE 0U +#define SYNC_DIRECT_FROM_DEVICE 1U + __u32 direction; + __u64 offset; + __u64 size; +}; + +enum amdxdna_cmd_type { + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0, + AMDXDNA_CMD_SUBMIT_DEPENDENCY, + AMDXDNA_CMD_SUBMIT_SIGNAL, +}; + +/** + * struct amdxdna_drm_exec_cmd - Execute command. 
+ * @ext: MBZ. + * @ext_flags: MBZ. + * @hwctx: Hardware context handle. + * @type: One of command type in enum amdxdna_cmd_type. + * @cmd_handles: Array of command handles or the command handle itself in case + * of just one. + * @args: Array of arguments for all command handles. + * @cmd_count: Number of command handles in the cmd_handles array. + * @arg_count: Number of arguments in the args array. + * @seq: Returned sequence number for this command. + */ +struct amdxdna_drm_exec_cmd { + __u64 ext; + __u64 ext_flags; + __u32 hwctx; + __u32 type; + __u64 cmd_handles; + __u64 args; + __u32 cmd_count; + __u32 arg_count; + __u64 seq; +}; + +/** + * struct amdxdna_drm_wait_cmd - Wait exectuion command. + * + * @hwctx: hardware context handle. + * @timeout: timeout in ms, 0 implies infinite wait. + * @seq: sequence number of the command returned by execute command. + * + * Wait a command specified by seq to be completed. + */ +struct amdxdna_drm_wait_cmd { + __u32 hwctx; + __u32 timeout; + __u64 seq; +}; + +/** + * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware + * @buffer: The user space buffer that will return the AIE status + * @buffer_size: The size of the user space buffer + * @cols_filled: A bitmap of AIE columns whose data has been returned in the + * buffer. + */ +struct amdxdna_drm_query_aie_status { + __u64 buffer; /* out */ + __u32 buffer_size; /* in */ + __u32 cols_filled; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware + * @major: The major version number + * @minor: The minor version number + */ +struct amdxdna_drm_query_aie_version { + __u32 major; /* out */ + __u32 minor; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile + * (core, mem, shim) + * @row_count: The number of rows. + * @row_start: The starting row number. + * @dma_channel_count: The number of dma channels. + * @lock_count: The number of locks. 
+ * @event_reg_count: The number of events. + * @pad: MBZ. + */ +struct amdxdna_drm_query_aie_tile_metadata { + __u16 row_count; + __u16 row_start; + __u16 dma_channel_count; + __u16 lock_count; + __u16 event_reg_count; + __u16 pad[3]; +}; + +/** + * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE + * hardware + * @col_size: The size of a column in bytes. + * @cols: The total number of columns. + * @rows: The total number of rows. + * @version: The version of the AIE hardware. + * @core: The metadata for all core tiles. + * @mem: The metadata for all mem tiles. + * @shim: The metadata for all shim tiles. + */ +struct amdxdna_drm_query_aie_metadata { + __u32 col_size; + __u16 cols; + __u16 rows; + struct amdxdna_drm_query_aie_version version; + struct amdxdna_drm_query_aie_tile_metadata core; + struct amdxdna_drm_query_aie_tile_metadata mem; + struct amdxdna_drm_query_aie_tile_metadata shim; +}; + +/** + * struct amdxdna_drm_query_clock - Metadata for a clock + * @name: The clock name. + * @freq_mhz: The clock frequency. + * @pad: MBZ. + */ +struct amdxdna_drm_query_clock { + __u8 name[16]; + __u32 freq_mhz; + __u32 pad; +}; + +/** + * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks + * @mp_npu_clock: The metadata for MP-NPU clock. + * @h_clock: The metadata for H clock. + */ +struct amdxdna_drm_query_clock_metadata { + struct amdxdna_drm_query_clock mp_npu_clock; + struct amdxdna_drm_query_clock h_clock; +}; + +enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; + +/** + * struct amdxdna_drm_query_sensor - The data for single sensor. + * @label: The name for a sensor. + * @input: The current value of the sensor. + * @max: The maximum value possible for the sensor. + * @average: The average value of the sensor. + * @highest: The highest recorded sensor value for this driver load for the + * sensor. + * @status: The sensor status. + * @units: The sensor units. 
+ * @unitm: Translates value member variables into the correct unit via (pow(10, + * unitm) * value) + * @type: The sensor type from enum amdxdna_sensor_type + * @pad: MBZ. + */ +struct amdxdna_drm_query_sensor { + __u8 label[64]; + __u32 input; + __u32 max; + __u32 average; + __u32 highest; + __u8 status[64]; + __u8 units[16]; + __s8 unitm; + __u8 type; + __u8 pad[6]; +}; + +/** + * struct amdxdna_drm_query_hwctx - The data for single context. + * @context_id: The ID for this context. + * @start_col: The starting column for the partition assigned to this context. + * @num_col: The number of columns in the partition assigned to this context. + * @pid: The Process ID of the process that created this context. + * @command_submissions: The number of commands submitted to this context. + * @command_completions: The number of commands completed by this context. + * @migrations: The number of times this context has been moved to a different + * partition. + * @preemptions: The number of times this context has been preempted by another + * context in the same partition. + * @pad: MBZ. + */ +struct amdxdna_drm_query_hwctx { + __u32 context_id; + __u32 start_col; + __u32 num_col; + __u32 pad; + __s64 pid; + __u64 command_submissions; + __u64 command_completions; + __u64 migrations; + __u64 preemptions; + __u64 errors; +}; + +/** + * struct amdxdna_drm_aie_mem - The data for AIE memory read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE memory address to read/write + * @size: The size of bytes to read/write + * @buf_p: The buffer to store read/write data + * + * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM + * parameters. 
+ */ +struct amdxdna_drm_aie_mem { + __u32 col; + __u32 row; + __u32 addr; + __u32 size; + __u64 buf_p; +}; + +/** + * struct amdxdna_drm_aie_reg - The data for AIE register read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE register address to read/write + * @val: The value to write or returned value from AIE + * + * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG + * parameters. + */ +struct amdxdna_drm_aie_reg { + __u32 col; + __u32 row; + __u32 addr; + __u32 val; +}; + +enum amdxdna_power_mode_type { + POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */ + POWER_MODE_LOW, /**< Set frequency to lowest DPM */ + POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ + POWER_MODE_HIGH, /**< Set frequency to highest DPM */ + POWER_MODE_TURBO, /**< More power, more performance */ +}; + +/** + * struct amdxdna_drm_get_power_mode - Get the power mode of the AIE hardware + * @power_mode: The sensor type from enum amdxdna_power_mode_type + * @pad: MBZ. + */ +struct amdxdna_drm_get_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +/** + * struct amdxdna_drm_query_firmware_version - Query the version of the firmware + * @major: The major version number + * @minor: The minor version number + * @patch: The patch level version number + * @build: The build ID + */ +struct amdxdna_drm_query_firmware_version { + __u32 major; /* out */ + __u32 minor; /* out */ + __u32 patch; /* out */ + __u32 build; /* out */ +}; + +enum amdxdna_drm_get_param { + DRM_AMDXDNA_QUERY_AIE_STATUS, + DRM_AMDXDNA_QUERY_AIE_METADATA, + DRM_AMDXDNA_QUERY_AIE_VERSION, + DRM_AMDXDNA_QUERY_CLOCK_METADATA, + DRM_AMDXDNA_QUERY_SENSORS, + DRM_AMDXDNA_QUERY_HW_CONTEXTS, + DRM_AMDXDNA_READ_AIE_MEM, + DRM_AMDXDNA_READ_AIE_REG, + DRM_AMDXDNA_QUERY_FIRMWARE_VERSION, + DRM_AMDXDNA_GET_POWER_MODE, + DRM_AMDXDNA_QUERY_TELEMETRY, + DRM_AMDXDNA_NUM_GET_PARAM, +}; + +/** + * struct amdxdna_drm_get_info - Get some information from the AIE hardware. 
+ * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. Size needed/written by the kernel. + * @buffer: A structure specified by the param struct member. + */ +struct amdxdna_drm_get_info { + __u32 param; /* in */ + __u32 buffer_size; /* in/out */ + __u64 buffer; /* in/out */ +}; + +/** + * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware + * @power_mode: The sensor type from enum amdxdna_power_mode_type + * @pad: MBZ. + */ +struct amdxdna_drm_set_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +enum amdxdna_drm_set_param { + DRM_AMDXDNA_SET_POWER_MODE, + DRM_AMDXDNA_WRITE_AIE_MEM, + DRM_AMDXDNA_WRITE_AIE_REG, + DRM_AMDXDNA_NUM_SET_PARAM, +}; + +/** + * struct amdxdna_drm_set_state - Set the state of some component within the AIE + * hardware. + * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. + * @buffer: A structure specified by the param struct member. 
+ */ +struct amdxdna_drm_set_state { + __u32 param; /* in */ + __u32 buffer_size; /* in */ + __u64 buffer; /* in */ +}; + +#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ + struct amdxdna_drm_create_hwctx) + +#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ + struct amdxdna_drm_destroy_hwctx) + +#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ + struct amdxdna_drm_config_hwctx) + +#define DRM_IOCTL_AMDXDNA_CREATE_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ + struct amdxdna_drm_create_bo) + +#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ + struct amdxdna_drm_get_bo_info) + +#define DRM_IOCTL_AMDXDNA_SYNC_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) + +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) + +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) + +#define DRM_IOCTL_AMDXDNA_GET_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) + +#define DRM_IOCTL_AMDXDNA_SET_STATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ + struct amdxdna_drm_set_state) + +#if defined(__cplusplus) +} /* extern c end */ +#endif + +#endif /* AMDXDNA_ACCEL_H_ */ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp new file mode 100644 index 000000000..7756865a1 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "bo.h" + +#include +#include +#include +#include + +#include "shim_debug.h" + +namespace { + +uint32_t alloc_drm_bo(const shim_xdna::pdev &dev, amdxdna_bo_type type, + void *buf, size_t size) { + amdxdna_drm_create_bo cbo = { + .type = static_cast(type), + .vaddr = reinterpret_cast(buf), + .size = size, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CREATE_BO, &cbo); + return cbo.handle; +} + +void free_drm_bo(const shim_xdna::pdev &dev, uint32_t boh) { + drm_gem_close close_bo = {boh, 0}; + dev.ioctl(DRM_IOCTL_GEM_CLOSE, &close_bo); +} + +void get_drm_bo_info(const shim_xdna::pdev &dev, uint32_t boh, + amdxdna_drm_get_bo_info *bo_info) { + bo_info->handle = boh; + dev.ioctl(DRM_IOCTL_AMDXDNA_GET_BO_INFO, bo_info); +} + +void *map_parent_range(size_t size) { + auto p = ::mmap(nullptr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) shim_xdna::shim_err(errno, "mmap(len=%ld) failed", size); /* mmap() reports failure as MAP_FAILED, never as nullptr */ + + return p; +} + +void *map_drm_bo(const shim_xdna::pdev &dev, size_t size, int prot, + uint64_t offset) { + return dev.mmap(nullptr, size, prot, MAP_SHARED | MAP_LOCKED, offset); +} + +void *map_drm_bo(const shim_xdna::pdev &dev, void *addr, size_t size, int prot, + int flags, uint64_t offset) { + return dev.mmap(addr, size, prot, flags, offset); +} + +void unmap_drm_bo(const shim_xdna::pdev &dev, void *addr, size_t size) { + munmap(addr, size); +} + +void attach_dbg_drm_bo(const shim_xdna::pdev &dev, uint32_t boh, + uint32_t ctx_id) { + amdxdna_drm_config_hwctx adbo = { + .handle = ctx_id, + .param_type = DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + .param_val = boh, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); +} + +void detach_dbg_drm_bo(const shim_xdna::pdev &dev, uint32_t boh, + uint32_t ctx_id) { + amdxdna_drm_config_hwctx adbo = { + .handle = ctx_id, + .param_type = DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + .param_val = boh, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); +} + +int export_drm_bo(const shim_xdna::pdev &dev, uint32_t 
boh) { + drm_prime_handle exp_bo = {boh, DRM_RDWR | DRM_CLOEXEC, -1}; + dev.ioctl(DRM_IOCTL_PRIME_HANDLE_TO_FD, &exp_bo); + return exp_bo.fd; +} + +uint32_t import_drm_bo(const shim_xdna::pdev &dev, + const shim_xdna::shared_handle &share, + amdxdna_bo_type *type, size_t *size) { + int fd = share.get_export_handle(); + drm_prime_handle imp_bo = {AMDXDNA_INVALID_BO_HANDLE, 0, fd}; + dev.ioctl(DRM_IOCTL_PRIME_FD_TO_HANDLE, &imp_bo); + + *type = AMDXDNA_BO_SHMEM; + *size = lseek(fd, 0, SEEK_END); + lseek(fd, 0, SEEK_SET); + + return imp_bo.handle; +} + +bool is_power_of_two(size_t x) { return x > 0 && (x & x - 1) == 0; } + +void *addr_align(void *p, size_t align) { + if (!is_power_of_two(align)) + shim_xdna::shim_err(EINVAL, "Alignment 0x%lx is not power of two", align); + + return reinterpret_cast(((uintptr_t)p + align - 1) & ~(align - 1)); /* round up; an already-aligned p is returned unchanged so the result stays inside a (2 * align - 1)-byte reservation */ +} + +amdxdna_bo_type flag_to_type(uint64_t bo_flags) { + auto flags = xcl_bo_flags{bo_flags}; + auto boflags = (static_cast(flags.boflags) << 24); + switch (boflags) { + case XCL_BO_FLAGS_NONE: + case XCL_BO_FLAGS_HOST_ONLY: + return AMDXDNA_BO_SHMEM; + case XCL_BO_FLAGS_CACHEABLE: + return AMDXDNA_BO_DEV; + case XCL_BO_FLAGS_EXECBUF: + return AMDXDNA_BO_CMD; + default: + break; + } + return AMDXDNA_BO_INVALID; +} + +// flush cache lines for non-coherent memory +inline void clflush_data(const void *base, size_t offset, size_t len) { + static long cacheline_size = 0; + + if (!cacheline_size) { + long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sz <= 0) + shim_xdna::shim_err(EINVAL, "Invalid cache line size: %ld", sz); + cacheline_size = sz; + } + + const char *cur = (const char *)base; + cur += offset; + uintptr_t lastline = (uintptr_t)(cur + len - 1) | (cacheline_size - 1); + do { + _mm_clflush(cur); + cur += cacheline_size; + } while (cur <= (const char *)lastline); +} + +void sync_drm_bo(const shim_xdna::pdev &dev, uint32_t boh, + shim_xdna::direction dir, size_t offset, size_t len) { + amdxdna_drm_sync_bo sbo = { + .handle = 
boh, + .direction = + (dir == shim_xdna::direction::host2device ? SYNC_DIRECT_TO_DEVICE + : SYNC_DIRECT_FROM_DEVICE), + .offset = offset, + .size = len, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_SYNC_BO, &sbo); +} + +bool is_driver_sync() { + static int drv_sync = -1; + + if (drv_sync == -1) { + bool ds = std::getenv("Debug.force_driver_sync"); + drv_sync = ds ? 1 : 0; + } + return drv_sync == 1; +} + +} // namespace + +namespace shim_xdna { + +drm_bo::drm_bo(bo &parent, const amdxdna_drm_get_bo_info &bo_info) + : m_parent(parent), + m_handle(bo_info.handle), + m_map_offset(bo_info.map_offset), + m_xdna_addr(bo_info.xdna_addr), + m_vaddr(bo_info.vaddr) {} + +drm_bo::~drm_bo() { + if (m_handle == AMDXDNA_INVALID_BO_HANDLE) return; + try { + free_drm_bo(m_parent.m_pdev, m_handle); + } catch (const std::system_error &e) { + shim_debug("Failed to free DRM BO: %s", e.what()); + } +} + +std::string bo::type_to_name() const { + switch (m_type) { + case AMDXDNA_BO_SHMEM: + return {"AMDXDNA_BO_SHMEM"}; + case AMDXDNA_BO_DEV_HEAP: + return {"AMDXDNA_BO_DEV_HEAP"}; + case AMDXDNA_BO_DEV: + if (xcl_bo_flags{m_flags}.use == XRT_BO_USE_DEBUG) + return {"AMDXDNA_BO_DEV_DEBUG"}; + return {"AMDXDNA_BO_DEV"}; + case AMDXDNA_BO_CMD: + return {"AMDXDNA_BO_CMD"}; + default:; + return {"BO_UNKNOWN"}; + } + return {"BO_UNKNOWN"}; +} + +std::string bo::describe() const { + std::string desc = "type="; + desc += type_to_name(); + desc += ", "; + desc += "drm_bo="; + desc += std::to_string(m_drm_bo->m_handle); + desc += ", "; + desc += "size="; + desc += std::to_string(m_aligned_size); + return desc; +} + +void bo::mmap_bo(size_t align) { + size_t a = align; + + if (m_drm_bo->m_map_offset == AMDXDNA_INVALID_ADDR) { + m_aligned = reinterpret_cast(m_drm_bo->m_vaddr); + return; + } + + if (a == 0) { + m_aligned = map_drm_bo(m_pdev, m_aligned_size, PROT_READ | PROT_WRITE, + m_drm_bo->m_map_offset); + return; + } + + /* + * Handle special alignment + * The first mmap() is just for reserved a range in 
user virtual address + * space. The second mmap() uses an aligned addr as the first argument in mmap + * syscall. + */ + m_parent_size = align * 2 - 1; + m_parent = map_parent_range(m_parent_size); + auto aligned = addr_align(m_parent, align); + m_aligned = + map_drm_bo(m_pdev, aligned, m_aligned_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, m_drm_bo->m_map_offset); +} + +void bo::munmap_bo() { + shim_debug("Unmap BO, aligned %p parent %p", m_aligned, m_parent); + if (m_drm_bo->m_map_offset == AMDXDNA_INVALID_ADDR) return; + + unmap_drm_bo(m_pdev, m_aligned, m_aligned_size); + if (m_parent) unmap_drm_bo(m_pdev, m_parent, m_parent_size); +} + +void bo::alloc_bo() { + uint32_t boh = alloc_drm_bo(m_pdev, m_type, nullptr, m_aligned_size); + + amdxdna_drm_get_bo_info bo_info = {}; + get_drm_bo_info(m_pdev, boh, &bo_info); + m_drm_bo = std::make_unique(*this, bo_info); +} + +void bo::import_bo() { + uint32_t boh = import_drm_bo(m_pdev, m_import, &m_type, &m_aligned_size); + + amdxdna_drm_get_bo_info bo_info = {}; + get_drm_bo_info(m_pdev, boh, &bo_info); + m_drm_bo = std::make_unique(*this, bo_info); +} + +void bo::free_bo() { m_drm_bo.reset(); } + +bo::bo(const pdev &p, uint32_t ctx_id, size_t size, uint64_t flags) + : bo(p, ctx_id, size, flags, flag_to_type(flags)) { + if (m_type == AMDXDNA_BO_INVALID) + shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); +} + +bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, uint64_t flags, + amdxdna_bo_type type) + : m_pdev(pdev), + m_aligned_size(size), + m_flags(flags), + m_type(type), + m_import(-1), + m_owner_ctx_id(ctx_id) { + size_t align = 0; + + if (m_type == AMDXDNA_BO_DEV_HEAP) + align = 64 * 1024 * 1024; // Device mem heap must align at 64MB boundary. + + alloc_bo(); + mmap_bo(align); + + // Newly allocated buffer may contain dirty pages. If used as output buffer, + // the data in cacheline will be flushed onto memory and pollute the output + // from device. 
We perform a cache flush right after the BO is allocated to + // avoid this issue. + if (m_type == AMDXDNA_BO_SHMEM) sync(direction::host2device, size, 0); + + attach_to_ctx(); +#ifndef NDEBUG + switch (m_flags) { + case 0x0: + shim_debug("allocating dev heap"); + break; + case 0x1000000: + // pdi bo + shim_debug("allocating pdi bo"); + break; + case 0x20000000: + // XCL_BO_FLAGS_HOST_ONLY (1U << 29) in create_free_bo test + shim_debug("allocating XCL_BO_FLAGS_HOST_ONLY"); + break; + case 0x80000000: + // XCL_BO_FLAGS_EXECBUF in create_free_bo test + shim_debug("allocating XCL_BO_FLAGS_EXECBUF"); + break; + case 0x1001000000: + // debug bo + shim_debug("allocating debug bo"); + break; + default: + shim_err(-1, "unknown flags 0x%lx", flags); + } +#endif + + shim_debug( + "Allocated KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, " + "type=%d, drm_bo=%d)", + m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); +} + +bo::bo(const pdev &p, int ehdl) : m_pdev(p), m_import(ehdl) { + import_bo(); + mmap_bo(); + shim_debug( + "Imported KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, type=%d, " + "drm_bo=%d)", + m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); +} + +bo::~bo() { + shim_debug("Freeing KMQ BO, %s", describe().c_str()); + + munmap_bo(); + try { + detach_from_ctx(); + // If BO is in use, we should block and wait in driver + free_bo(); + } catch (const std::system_error &e) { + shim_debug("Failed to free BO: %s", e.what()); + } +} + +bo::bo(const pdev &p, size_t size, amdxdna_bo_type type) + : bo(p, AMDXDNA_INVALID_CTX_HANDLE, size, 0, type) {} + +properties bo::get_properties() const { + return {m_flags, m_aligned_size, get_paddr(), get_drm_bo_handle()}; +} + +void *bo::map(map_type type) const { + if (type != map_type::write) + shim_err( + EINVAL, + "Not support map BO as readonly. 
Type must be bo::map_type::write"); + return m_aligned; +} + +void bo::unmap(void *addr) {} + +uint64_t bo::get_paddr() const { + if (m_drm_bo->m_xdna_addr != AMDXDNA_INVALID_ADDR) + return m_drm_bo->m_xdna_addr; + return reinterpret_cast(m_aligned); +} + +void bo::set_cmd_id(uint64_t id) { m_cmd_id = id; } + +uint64_t bo::get_cmd_id() const { return m_cmd_id; } + +uint32_t bo::get_drm_bo_handle() const { return m_drm_bo->m_handle; } + +void bo::attach_to_ctx() { + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; + + auto boh = get_drm_bo_handle(); + shim_debug("Attaching drm_bo %d to ctx: %d", boh, m_owner_ctx_id); + attach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); +} + +void bo::detach_from_ctx() { + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; + + auto boh = get_drm_bo_handle(); + shim_debug("Detaching drm_bo %d from ctx: %d", boh, m_owner_ctx_id); + detach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); +} + +std::unique_ptr bo::share() const { + auto boh = get_drm_bo_handle(); + auto fd = export_drm_bo(m_pdev, boh); + shim_debug("Exported bo %d to fd %d", boh, fd); + return std::make_unique(fd); +} + +amdxdna_bo_type bo::get_type() const { return m_type; } + +void bo::sync(direction dir, size_t size, size_t offset) { + if (is_driver_sync()) { + sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); + return; + } + + if (offset + size > m_aligned_size) + shim_err(EINVAL, "Invalid BO offset and size for sync'ing: %ld, %ld", + offset, size); + + switch (m_type) { + case AMDXDNA_BO_SHMEM: + case AMDXDNA_BO_CMD: + clflush_data(m_aligned, offset, size); + break; + case AMDXDNA_BO_DEV: + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) + clflush_data(m_aligned, offset, size); + else + sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); + break; + default: + shim_err(ENOTSUP, "Can't sync bo type %d", m_type); + } +} + +void bo::bind_at(size_t pos, const bo *bh, size_t offset, size_t size) { + auto boh = reinterpret_cast(bh); + 
std::lock_guard lg(m_args_map_lock); + + if (m_type != AMDXDNA_BO_CMD) + shim_err(EINVAL, "Can't call bind_at() on non-cmd BO"); + + if (!pos) m_args_map.clear(); + + if (boh->get_type() != AMDXDNA_BO_CMD) { + auto h = boh->get_drm_bo_handle(); + m_args_map[pos] = h; + shim_debug("Added arg BO %d to cmd BO %d", h, get_drm_bo_handle()); + } else { + const size_t max_args_order = 6; + const size_t max_args = 1 << max_args_order; + size_t key = pos << max_args_order; + uint32_t hs[max_args]; + auto arg_cnt = boh->get_arg_bo_handles(hs, max_args); + std::string bohs; + for (int i = 0; i < arg_cnt; i++) { + m_args_map[key + i] = hs[i]; + bohs += std::to_string(hs[i]) + " "; + } + shim_debug("Added arg BO %s to cmd BO %d", bohs.c_str(), + get_drm_bo_handle()); + } +} + +uint32_t bo::get_arg_bo_handles(uint32_t *handles, size_t num) const { + std::lock_guard lg(m_args_map_lock); + + auto sz = m_args_map.size(); + if (sz > num) + shim_err(E2BIG, "There are %ld BO args, provided buffer can hold only %ld", + sz, num); + + for (auto m : m_args_map) *(handles++) = m.second; + + return sz; +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h new file mode 100644 index 000000000..73567f262 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef _BO_XDNA_H_ +#define _BO_XDNA_H_ + +#include + +#include "amdxdna_accel.h" +#include "device.h" +#include "hwctx.h" + +namespace shim_xdna { + +#define XRT_BO_USE_NORMAL 0 +#define XRT_BO_USE_DEBUG 1 + +/** + * XCL BO Flags bits layout + * + * bits 0 ~ 15: DDR BANK index + * bits 24 ~ 31: BO flags + */ +#define XRT_BO_FLAGS_MEMIDX_MASK (0xFFFFFFUL) +#define XCL_BO_FLAGS_NONE (0) +#define XCL_BO_FLAGS_CACHEABLE (1U << 24) +#define XCL_BO_FLAGS_KERNBUF (1U << 25) +#define XCL_BO_FLAGS_SGL (1U << 26) +#define XCL_BO_FLAGS_SVM (1U << 27) +#define XCL_BO_FLAGS_DEV_ONLY (1U << 28) +#define XCL_BO_FLAGS_HOST_ONLY (1U << 29) +#define XCL_BO_FLAGS_P2P (1U << 30) +#define XCL_BO_FLAGS_EXECBUF (1U << 31) + +/** + * Encoding of flags passed to xcl buffer allocation APIs + */ +struct xcl_bo_flags { + union { + uint64_t all; // [63-0] + + struct { + uint32_t flags; // [31-0] + uint32_t extension; // [63-32] + }; + + struct { + uint16_t bank; // [15-0] + uint8_t slot; // [23-16] + uint8_t boflags; // [31-24] + + // extension + uint32_t access : 2; // [33-32] + uint32_t dir : 2; // [35-34] + uint32_t use : 1; // [36] + uint32_t unused : 27; // [63-37] + }; + }; +}; + +// map_type - determines how a buffer is mapped +enum class map_type { read, write }; + +enum xclBOSyncDirection { + XCL_BO_SYNC_BO_TO_DEVICE = 0, + XCL_BO_SYNC_BO_FROM_DEVICE, + XCL_BO_SYNC_BO_GMIO_TO_AIE, + XCL_BO_SYNC_BO_AIE_TO_GMIO, +}; + +// direction - direction of sync operation +enum class direction { + host2device = XCL_BO_SYNC_BO_TO_DEVICE, + device2host = XCL_BO_SYNC_BO_FROM_DEVICE, +}; + +// properties - buffer details +struct properties { + uint64_t flags; // flags of bo + uint64_t size; // size of bo + uint64_t paddr; // physical address + uint64_t kmhdl; // kernel mode handle +}; + +struct drm_bo { + bo &m_parent; + uint32_t m_handle = AMDXDNA_INVALID_BO_HANDLE; + off_t m_map_offset = AMDXDNA_INVALID_ADDR; + uint64_t m_xdna_addr = AMDXDNA_INVALID_ADDR; + uint64_t m_vaddr = 
AMDXDNA_INVALID_ADDR;
+
+  drm_bo(bo &parent, const amdxdna_drm_get_bo_info &bo_info);
+  ~drm_bo();
+};
+
+struct bo {
+  const pdev &m_pdev;
+  void *m_parent = nullptr;
+  void *m_aligned = nullptr;
+  size_t m_parent_size = 0;
+  size_t m_aligned_size = 0;
+  uint64_t m_flags = 0;
+  amdxdna_bo_type m_type = AMDXDNA_BO_INVALID;
+  std::unique_ptr m_drm_bo;
+  const shared_handle m_import;
+  // Only for AMDXDNA_BO_CMD type
+  std::map m_args_map;
+  mutable std::mutex m_args_map_lock;
+
+  // Command ID in the queue after command submission.
+  // Only valid for cmd BO.
+  uint64_t m_cmd_id = -1;
+
+  // Used when exclusively assigned to a HW context. By default, BO is shared
+  // among all HW contexts.
+  uint32_t m_owner_ctx_id = AMDXDNA_INVALID_CTX_HANDLE;
+
+  bo(const pdev &p, uint32_t ctx_id, size_t size, uint64_t flags,
+     amdxdna_bo_type type);
+  bo(const pdev &p, uint32_t ctx_id, size_t size, uint64_t flags);
+  bo(const pdev &p, int ehdl);
+  // Support BO creation from internal
+  bo(const pdev &p, size_t size, amdxdna_bo_type type);
+  ~bo();
+
+  void *map(map_type) const;
+  void unmap(void *addr);
+  void sync(direction, size_t size, size_t offset);
+  properties get_properties() const;
+  std::unique_ptr share() const;
+  // For cmd BO only
+  void set_cmd_id(uint64_t id);
+  // For cmd BO only
+  uint64_t get_cmd_id() const;
+  uint32_t get_drm_bo_handle() const;
+  amdxdna_bo_type get_type() const;
+  // DRM BO managed by driver.
+  void bind_at(size_t pos, const bo *bh, size_t offset, size_t size);
+  std::string describe() const;
+  // Alloc DRM BO from driver
+  void alloc_bo();
+  // Import DRM BO from m_import shared object
+  void import_bo();
+  // Free DRM BO in driver
+  void free_bo();
+  void mmap_bo(size_t align = 0);
+  void munmap_bo();
+  uint64_t get_paddr() const;
+  std::string type_to_name() const;
+  void attach_to_ctx();
+  void detach_from_ctx();
+  // Obtain array of arg BO handles, returns real number of handles
+  uint32_t get_arg_bo_handles(uint32_t *handles, size_t num) const;
+};
+
+}  // namespace shim_xdna
+
+#endif
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp
new file mode 100644
index 000000000..191741170
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. - All rights reserved
+
+#include "device.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "bo.h"
+#include "fence.h"
+#include "hwctx.h"
+
+namespace {
+
+// Resolves the exported handle `ehdl` owned by process `pid` to a file
+// descriptor usable in this process.
+//
+// When `pid` is 0 or is the calling process, `ehdl` is returned unchanged.
+// Otherwise the descriptor is duplicated with pidfd_open/pidfd_getfd.
+// Throws std::system_error when either syscall fails, or when the syscall
+// numbers were not available at build time.
+int64_t import_fd(pid_t pid, int ehdl) {
+  if (pid == 0 || getpid() == pid) return ehdl;
+
+#if defined(SYS_pidfd_open) && defined(SYS_pidfd_getfd)
+  const auto pidfd = syscall(SYS_pidfd_open, pid, 0);
+  if (pidfd < 0)
+    throw std::system_error(errno, std::system_category(), "pidfd_open failed");
+
+  const int64_t fd = syscall(SYS_pidfd_getfd, pidfd, ehdl, 0);
+  // Save errno now: the close() below may overwrite it.
+  const int getfd_errno = errno;
+  // The pidfd is only needed for the lookup above. Close it on both the
+  // success and the failure path so it does not leak on every import.
+  ::close(static_cast<int>(pidfd));
+
+  if (fd < 0) {
+    if (getfd_errno == EPERM) {
+      throw std::system_error(
+          getfd_errno, std::system_category(),
+          "pidfd_getfd failed, check that ptrace access mode "
+          "allows PTRACE_MODE_ATTACH_REALCREDS. For more details please "
+          "check /etc/sysctl.d/10-ptrace.conf");
+    }
+
+    throw std::system_error(getfd_errno, std::system_category(),
+                            "pidfd_getfd failed");
+  }
+  return fd;
+#else
+  throw std::system_error(
+      int(std::errc::not_supported), std::system_category(),
+      "Importing buffer object from different process requires XRT "
+      " built and installed on a system with 'pidfd' kernel support");
+#endif
+}
+
+// Maps an ioctl request code to its macro name for error messages. Codes
+// that are not listed are rendered as "UNKNOWN(<decimal value>)".
+std::string ioctl_cmd2name(unsigned long cmd) {
+  switch (cmd) {
+    case DRM_IOCTL_AMDXDNA_CREATE_HWCTX:
+      return "DRM_IOCTL_AMDXDNA_CREATE_HWCTX";
+    case DRM_IOCTL_AMDXDNA_DESTROY_HWCTX:
+      return "DRM_IOCTL_AMDXDNA_DESTROY_HWCTX";
+    case DRM_IOCTL_AMDXDNA_CONFIG_HWCTX:
+      return "DRM_IOCTL_AMDXDNA_CONFIG_HWCTX";
+    case DRM_IOCTL_AMDXDNA_CREATE_BO:
+      return "DRM_IOCTL_AMDXDNA_CREATE_BO";
+    case DRM_IOCTL_AMDXDNA_GET_BO_INFO:
+      return "DRM_IOCTL_AMDXDNA_GET_BO_INFO";
+    case DRM_IOCTL_AMDXDNA_SYNC_BO:
+      return "DRM_IOCTL_AMDXDNA_SYNC_BO";
+    case DRM_IOCTL_AMDXDNA_EXEC_CMD:
+      return "DRM_IOCTL_AMDXDNA_EXEC_CMD";
+    case DRM_IOCTL_AMDXDNA_WAIT_CMD:
+      return "DRM_IOCTL_AMDXDNA_WAIT_CMD";
+    case DRM_IOCTL_AMDXDNA_GET_INFO:
+      return "DRM_IOCTL_AMDXDNA_GET_INFO";
+    case DRM_IOCTL_AMDXDNA_SET_STATE:
+      return "DRM_IOCTL_AMDXDNA_SET_STATE";
+    case DRM_IOCTL_GEM_CLOSE:
+      return "DRM_IOCTL_GEM_CLOSE";
+    case DRM_IOCTL_PRIME_HANDLE_TO_FD:
+      return "DRM_IOCTL_PRIME_HANDLE_TO_FD";
+    case DRM_IOCTL_PRIME_FD_TO_HANDLE:
+      return "DRM_IOCTL_PRIME_FD_TO_HANDLE";
+    case DRM_IOCTL_SYNCOBJ_CREATE:
+      return "DRM_IOCTL_SYNCOBJ_CREATE";
+    case DRM_IOCTL_SYNCOBJ_QUERY:
+      return "DRM_IOCTL_SYNCOBJ_QUERY";
+    case DRM_IOCTL_SYNCOBJ_DESTROY:
+      return "DRM_IOCTL_SYNCOBJ_DESTROY";
+    case DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD:
+      return "DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD";
+    case DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE:
+      return "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE";
+    case DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL:
+      return "DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL";
+    case DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT:
+      return "DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT";
+    default:
+      return "UNKNOWN(" + std::to_string(cmd) + ")";
+  }
+}
+
+// Device memory heap needs to be within one 64MB page. The maximum size is
+// 64MB.
+const size_t dev_mem_size = (64 << 20);
+}  // namespace
+
+namespace shim_xdna {
+
+// Opens the accel device node and creates the device heap BO of
+// dev_mem_size bytes on it.
+pdev::pdev() {
+  const std::lock_guard lock(m_lock);
+  // TODO(max): hardcoded
+  m_dev_fd = ::open("/dev/accel/accel0", O_RDWR);
+  // Report the errno from open() (e.g. ENOENT vs EACCES), as pdev::ioctl and
+  // pdev::mmap do, instead of a fixed EINVAL.
+  if (m_dev_fd < 0) shim_err(errno, "Failed to open KMQ device");
+  shim_debug("Device opened, fd=%d", m_dev_fd);
+  m_dev_heap_bo =
+      std::make_unique(*this, dev_mem_size, AMDXDNA_BO_DEV_HEAP);
+  shim_debug("Created KMQ pcidev");
+}
+
+// Releases the device heap BO first, then closes the device descriptor.
+pdev::~pdev() {
+  shim_debug("Destroying KMQ pcidev");
+  const std::lock_guard lock(m_lock);
+  m_dev_heap_bo.reset();
+  ::close(m_dev_fd);
+  shim_debug("Device closed, fd=%d", m_dev_fd);
+  shim_debug("Destroyed KMQ pcidev");
+}
+
+// Issues `cmd` on the device descriptor; a -1 result is reported through
+// shim_err with errno and the symbolic name of the request.
+void pdev::ioctl(unsigned long cmd, void *arg) const {
+  if (::ioctl(m_dev_fd, cmd, arg) == -1)
+    shim_err(errno, "%s IOCTL failed", ioctl_cmd2name(cmd).c_str());
+}
+
+// Maps `len` bytes of the device descriptor at `offset`; failure is reported
+// through shim_err with errno and the full argument list.
+void *pdev::mmap(void *addr, size_t len, int prot, int flags,
+                 off_t offset) const {
+  void *ret = ::mmap(addr, len, prot, flags, m_dev_fd, offset);
+  // `len` is a size_t, so it is formatted with %zu; `offset` is widened
+  // explicitly so that %ld matches on every ABI.
+  if (ret == MAP_FAILED)
+    shim_err(errno,
+             "mmap(addr=%p, len=%zu, prot=%d, flags=%d, offset=%ld) failed",
+             addr, len, prot, flags, static_cast<long>(offset));
+  return ret;
+}
+
+device::device() { shim_debug("Created KMQ device"); }
+
+device::~device() { shim_debug("Destroying KMQ device"); }
+
+const pdev &device::get_pdev() const { return m_pdev; }
+
+// Returns the xclbin previously stored by record_xclbin() when `xclbin_id`
+// matches its uuid; throws std::runtime_error otherwise.
+xrt::xclbin device::get_xclbin(const xrt::uuid &xclbin_id) const {
+  // record_xclbin() assigns m_xclbin under m_mutex, so take the same lock
+  // before reading and copying it.
+  std::lock_guard lk(m_mutex);
+  // Allow access to xclbin in process of loading via device::load_xclbin
+  if (xclbin_id && xclbin_id == m_xclbin.get_uuid()) return m_xclbin;
+  throw std::runtime_error("TODO(max):multi-xclbin");
+}
+
+std::unique_ptr device::create_hw_context(
+    const xrt::uuid &xclbin_uuid, const std::map &qos) {
+  return std::make_unique(*this, get_xclbin(xclbin_uuid), qos);
+}
+
+std::unique_ptr device::alloc_bo(size_t size, uint64_t flags) {
+  return alloc_bo(nullptr, size, flags);
+}
+
+std::unique_ptr device::alloc_bo(void *userptr, size_t size,
+                                 uint64_t flags) {
+  return alloc_bo(userptr, AMDXDNA_INVALID_CTX_HANDLE, size, flags);
+}
+
+std::unique_ptr device::import_bo(pid_t pid, int ehdl) {
+  return import_bo(import_fd(pid, ehdl));
+}
+
+std::unique_ptr device::create_fence(fence_handle::access_mode) {
+  return std::make_unique(*this);
+}
+
+std::unique_ptr device::import_fence(pid_t pid, int ehdl) {
+  return std::make_unique(*this, import_fd(pid, ehdl));
+}
+
+void device::record_xclbin(const xrt::xclbin &xclbin) {
+  std::lock_guard lk(m_mutex);
+  m_xclbin = xclbin;
+}
+
+// User-pointer backed BOs are rejected; everything else is forwarded to the
+// bo constructor.
+std::unique_ptr device::alloc_bo(void *userptr, uint32_t ctx_id,
+                                 size_t size, uint64_t flags) {
+  if (userptr) shim_not_supported_err("User ptr BO");
+
+  return std::make_unique(this->m_pdev, ctx_id, size, flags);
+}
+
+std::unique_ptr device::import_bo(int ehdl) const {
+  return std::make_unique(this->m_pdev, ehdl);
+}
+
+// Reads `size` bytes at `offset` from the tile at (col, row) through the
+// DRM_AMDXDNA_READ_AIE_MEM query and returns them.
+std::vector device::read_aie_mem(uint16_t col, uint16_t row,
+                                 uint32_t offset, uint32_t size) {
+  amdxdna_drm_aie_mem mem{};
+  std::vector store_buf(size);
+  mem.col = col;
+  mem.row = row;
+  mem.addr = offset;
+  mem.size = size;
+  mem.buf_p = reinterpret_cast(store_buf.data());
+  amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_MEM,
+                              .buffer_size = sizeof(mem),
+                              .buffer = reinterpret_cast(&mem)};
+  m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg);
+  return store_buf;
+}
+
+// Reads the register at `reg_addr` of the tile at (col, row).
+uint32_t device::read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr) {
+  amdxdna_drm_aie_reg reg{};
+  reg.col = col;
+  reg.row = row;
+  reg.addr = reg_addr;
+  reg.val = 0;
+  amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_REG,
+                              .buffer_size = sizeof(reg),
+                              .buffer = reinterpret_cast(&reg)};
+  m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg);
+  return reg.val;
+}
+
+// Writes `buf` at `offset` into the tile at (col, row) and returns the number
+// of bytes submitted.
+size_t device::write_aie_mem(uint16_t col, uint16_t row, uint32_t offset,
+                             const std::vector &buf) {
+  amdxdna_drm_aie_mem mem{};
+  uint32_t size = static_cast(buf.size());
+  mem.col = col;
+  mem.row = row;
+  mem.addr = offset;
+  mem.size = size;
+  mem.buf_p = reinterpret_cast(buf.data());
+  // NOTE(review): the argument is declared as amdxdna_drm_get_info but is
+  // submitted with the SET_STATE ioctl - confirm against the driver UAPI
+  // header whether a dedicated set-state struct should be used here.
+  amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_MEM,
+                              .buffer_size = sizeof(mem),
+                              .buffer = reinterpret_cast(&mem)};
+  m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg);
+  return size;
+}
+
+// Writes `reg_val` to the register at `reg_addr` of the tile at (col, row).
+void device::write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr,
+                           uint32_t reg_val) {
+  amdxdna_drm_aie_reg reg{};
+  reg.col = col;
+  reg.row = row;
+  reg.addr = reg_addr;
+  reg.val = reg_val;
+  // NOTE(review): same get_info/SET_STATE mismatch as in write_aie_mem.
+  amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_REG,
+                              .buffer_size = sizeof(reg),
+                              .buffer = reinterpret_cast(&reg)};
+  m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg);
+}
+
+// Returns the first line of `filename`. If the file cannot be opened, an
+// error is written to stderr and an empty string is returned.
+std::string read_sysfs(const std::string &filename) {
+  std::ifstream file(filename);
+  std::string line;
+  if (file.is_open()) {
+    std::getline(file, line);
+    file.close();
+  } else {
+    std::cerr << "Error opening file: " << filename << std::endl;
+    line = "";
+  }
+  return line;
+}
+
+// Scans the amdxdna PCI driver directory for a symlink that resolves to a
+// path under /sys/devices and returns that path.
+// NOTE(review): every symlink visited is printed to stdout and the process
+// is terminated with exit(-1) when nothing matches - both look like WIP
+// behavior for library code; confirm before relying on it.
+std::filesystem::path find_npu_device() {
+  const std::filesystem::path drvpath = "/sys/bus/pci/drivers/amdxdna";
+  for (auto const &dir_entry : std::filesystem::directory_iterator{drvpath})
+    if (dir_entry.is_symlink()) {
+      std::cout << dir_entry.path() << '\n';
+      auto actual_path = drvpath / std::filesystem::read_symlink(dir_entry);
+      auto rel = std::filesystem::relative(actual_path, "/sys/devices");
+      if (!rel.empty() && rel.native()[0] != '.') return absolute(actual_path);
+    }
+  std::cerr << "No npu device found" << std::endl;
+  exit(-1);
+}
+
+}  // namespace shim_xdna
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h
new file mode 100644
index 000000000..4b5f224ad
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h
@@ -0,0
+1,70 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#ifndef PCIE_DEVICE_LINUX_XDNA_H
+#define PCIE_DEVICE_LINUX_XDNA_H
+
+#include
+#include
+
+#include "experimental/xrt_xclbin.h"
+#include "fence.h"
+#include "shim_debug.h"
+
+namespace shim_xdna {
+struct pdev;
+struct bo;
+
+// Wrapper around the accel device file descriptor. All members are mutable
+// so the ioctl/mmap helpers can be used through a const reference.
+struct pdev {
+  mutable std::mutex m_lock;
+  // Device descriptor; -1 until the constructor has opened the device.
+  mutable int m_dev_fd = -1;
+  // Device heap BO, created by the constructor and released by the
+  // destructor before the descriptor is closed.
+  mutable std::unique_ptr m_dev_heap_bo;
+
+  pdev();
+  ~pdev();
+
+  // Thin wrappers over ::ioctl / ::mmap on m_dev_fd; failures are reported
+  // through shim_err together with errno.
+  void ioctl(unsigned long cmd, void *arg) const;
+  void *mmap(void *addr, size_t len, int prot, int flags, off_t offset) const;
+};
+
+struct device {
+  enum class access_mode : uint8_t { exclusive = 0, shared = 1 };
+
+  // Last xclbin passed to record_xclbin().
+  xrt::xclbin m_xclbin;
+  // Held by record_xclbin() while m_xclbin is replaced.
+  mutable std::mutex m_mutex;
+  pdev m_pdev;
+
+  device();
+  ~device();
+
+  // Returns the recorded xclbin when `xclbin_id` matches its uuid, otherwise
+  // throws std::runtime_error.
+  xrt::xclbin get_xclbin(const xrt::uuid &xclbin_id) const;
+
+  std::unique_ptr import_bo(int ehdl) const;
+  const pdev &get_pdev() const;
+
+  // A non-null `userptr` is rejected via shim_not_supported_err.
+  std::unique_ptr alloc_bo(void *userptr, uint32_t ctx_id, size_t size,
+                           uint64_t flags);
+
+  // Convenience overloads: forward with a null user pointer and/or
+  // AMDXDNA_INVALID_CTX_HANDLE as the context id.
+  std::unique_ptr alloc_bo(size_t size, uint64_t flags);
+  std::unique_ptr alloc_bo(void *userptr, size_t size, uint64_t flags);
+  // Imports handle `ehdl` exported by process `pid` (pid 0 or the calling
+  // process means the handle is used as-is).
+  std::unique_ptr import_bo(pid_t, int);
+  std::unique_ptr create_hw_context(
+      const xrt::uuid &xclbin_uuid, const std::map &qos);
+  // AIE tile memory/register access addressed by (col, row); implemented on
+  // top of the GET_INFO (reads) and SET_STATE (writes) ioctls.
+  std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset,
+                           uint32_t size);
+  size_t write_aie_mem(uint16_t col, uint16_t row, uint32_t offset,
+                       const std::vector &buf);
+  uint32_t read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr);
+  void write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr,
+                     uint32_t reg_val);
+  std::unique_ptr create_fence(fence_handle::access_mode);
+  std::unique_ptr import_fence(pid_t, int);
+  void record_xclbin(const xrt::xclbin &xclbin);
+};
+
+// Returns the first line of `filename`, or an empty string (after logging to
+// stderr) when the file cannot be opened.
+std::string read_sysfs(const std::string &filename);
+// Locates the NPU device path via the amdxdna PCI driver directory in sysfs.
+std::filesystem::path find_npu_device();
+
+}  // namespace shim_xdna
+
+#endif
diff --git
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h new file mode 100644 index 000000000..058b68530 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h @@ -0,0 +1,1163 @@ +/* + * Copyright (C) 2019-2022, Xilinx Inc + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + * + * Apache License Verbiage + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * GPL license Verbiage: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. This program is + * distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + * License for more details. 
You should have received a copy of the
+ * GNU General Public License along with this program; if not, write
+ * to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ *
+ */
+
+/**
+ * DOC: XRT Embedded Runtime definition
+ *
+ * Header file *ert.h* defines data structures used by Embedded Runtime (ERT)
+ * and XRT xclExecBuf() API.
+ */
+
+#ifndef _ERT_H_
+#define _ERT_H_
+
+#if defined(__linux__) && defined(__KERNEL__)
+#include
+#elif defined(__windows__) && defined(_KERNEL_MODE)
+#include
+#elif defined(__cplusplus) && !defined(_KERNEL_MODE)
+#include
+#include
+#else
+#include
+#include
+#include
+#endif
+
+#ifdef _WIN32
+#pragma warning(push)
+#pragma warning(disable : 4200 4201)
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+
+#define to_cfg_pkg(pkg) ((struct ert_configure_cmd *)(pkg))
+#define to_start_krnl_pkg(pkg) ((struct ert_start_kernel_cmd *)(pkg))
+#define to_copybo_pkg(pkg) ((struct ert_start_copybo_cmd *)(pkg))
+#define to_cfg_sk_pkg(pkg) ((struct ert_configure_sk_cmd *)(pkg))
+#define to_init_krnl_pkg(pkg) ((struct ert_init_kernel_cmd *)(pkg))
+#define to_validate_pkg(pkg) ((struct ert_validate_cmd *)(pkg))
+#define to_abort_pkg(pkg) ((struct ert_abort_cmd *)(pkg))
+
+#define HOST_RW_PATTERN 0xF0F0F0F0
+#define DEVICE_RW_PATTERN 0x0F0F0F0F
+
+/**
+ * struct ert_packet: ERT generic packet format
+ *
+ * @state:    [3-0] current state of a command
+ * @custom:   [11-4] custom per specific commands
+ * @count:    [22-12] number of words in payload (data)
+ * @opcode:   [27-23] opcode identifying specific command
+ * @type:     [31-28] type of command (currently 0)
+ * @data:     count number of words representing packet payload
+ */
+struct ert_packet {
+  union {
+    struct {
+      uint32_t state : 4;  /* [3-0] */
+      uint32_t custom : 8; /* [11-4] */
+      uint32_t count : 11; /* [22-12] */
+      uint32_t opcode : 5; /* [27-23] */
+      uint32_t type : 4;   /* [31-28] */
+    };
+    uint32_t header;
+  };
+  uint32_t data[1]; /* count number of words */
+};
+
+/**
+ * struct ert_start_kernel_cmd: ERT start kernel command format
+ *
+ * @state:           [3-0]   current state of a command
+ * @stat_enabled:    [4]     enabled driver to record timestamp for various
+ *                           states cmd has gone through. The stat data
+ *                           is appended after cmd data.
+ * @extra_cu_masks:  [11-10] extra CU masks in addition to mandatory mask
+ * @count:           [22-12] number of words following header for cmd data. Not
+ *                           include stat data.
+ * @opcode:          [27-23] 0, opcode for start_kernel
+ * @type:            [31-28] 0, type of start_kernel
+ *
+ * @cu_mask:         first mandatory CU mask
+ * @data:            count-1 number of words representing interpreted payload
+ *
+ * The packet payload is comprised of reserved id field, a mandatory CU mask,
+ * and extra_cu_masks per header field, followed by a CU register map of size
+ * (count - (1 + extra_cu_masks)) uint32_t words.
+ */
+struct ert_start_kernel_cmd {
+  union {
+    struct {
+      uint32_t state : 4;          /* [3-0]   */
+      uint32_t stat_enabled : 1;   /* [4]     */
+      uint32_t unused : 5;         /* [9-5]   */
+      uint32_t extra_cu_masks : 2; /* [11-10] */
+      uint32_t count : 11;         /* [22-12] */
+      uint32_t opcode : 5;         /* [27-23] */
+      uint32_t type : 4;           /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  /* payload */
+  uint32_t cu_mask; /* mandatory cu mask */
+  uint32_t data[1]; /* count-1 number of words */
+};
+
+/**
+ * struct ert_dpu_data - interpretation of data payload for ERT_START_DPU
+ *
+ * @instruction_buffer:       address of instruction buffer
+ * @instruction_buffer_size:  size of instruction buffer in bytes
+ * @chained:                  number of following ert_dpu_data elements
+ *
+ * The ert_dpu_data is prepended to data payload of ert_start_kernel_cmd
+ * after any extra cu masks. The payload count of the ert packet is
+ * incremented with the size (words) of ert_dpu_data elements
+ * prepended to the data payload.
+ *
+ * The data payload for ERT_START_DPU is interpreted as fixed instruction
+ * buffer address along with instruction count, followed by regular kernel
+ * arguments.
+ */
+struct ert_dpu_data {
+  uint64_t instruction_buffer;      /* buffer address 2 words */
+  uint32_t instruction_buffer_size; /* size of buffer in bytes */
+  uint32_t chained; /* number of following ert_dpu_data elements */
+};
+
+/**
+ * struct ert_npu_data - interpretation of data payload for ERT_START_NPU
+ *
+ * @instruction_buffer:       address of instruction buffer
+ * @instruction_buffer_size:  size of instruction buffer in bytes
+ * @instruction_prop_count:   WORD length of property name value pairs
+ *
+ * The ert_npu_data is prepended to data payload of ert_start_kernel_cmd
+ * after any extra cu masks. The payload count of the ert packet is
+ * incremented with the size (words) of ert_npu_data elements
+ * prepended to the data payload.
+ *
+ * The data payload for ERT_START_NPU is interpreted as instruction
+ * buffer address, instruction count along with instruction property,
+ * followed by regular kernel arguments.
+ *
+ * When instruction_prop_count is non-zero, it indicates the length
+ * (in 32-bit WORDs) of the instruction buffer properties after this
+ * field. This count is reserved for future extension. One example
+ * property is the number of actual columns this instruction used.
+ */
+struct ert_npu_data {
+  uint64_t instruction_buffer;      /* buffer address 2 words */
+  uint32_t instruction_buffer_size; /* size of buffer in bytes */
+  uint32_t
+      instruction_prop_count; /* WORD length of following properties nv pairs */
+};
+
+/**
+ * struct ert_npu_preempt_data - interpretation of data payload for
+ * ERT_START_NPU_PREEMPT
+ *
+ * @instruction_buffer:       address of instruction buffer
+ * @save_buffer:              address of save instruction buffer
+ * @restore_buffer:           address of restore instruction buffer
+ * @instruction_buffer_size:  size of instruction buffer in bytes
+ * @save_buffer_size:         size of save instruction buffer in bytes
+ * @restore_buffer_size:      size of restore instruction buffer in bytes
+ * @instruction_prop_count:   number of property name value pairs
+ *
+ * The ert_npu_preempt_data is prepended to data payload of ert_start_kernel_cmd
+ * after any extra cu masks. The payload count of the ert packet is
+ * incremented with the size (words) of ert_npu_preempt_data elements
+ * prepended to the data payload.
+ *
+ * The data payload for ERT_START_NPU_PREEMPT is interpreted as instruction
+ * buffer, save instruction buffer, restore instruction buffer and their
+ * size, along with instruction property, followed by regular kernel arguments.
+ *
+ * When instruction_prop_count is non-zero, it indicates the length
+ * (in 32-bit WORDs) of the instruction buffer properties after this
+ * field. This count is reserved for future extension. One example
+ * property is the number of actual columns this instruction used.
+ */
+struct ert_npu_preempt_data {
+  uint64_t instruction_buffer;      /* buffer address 2 words */
+  uint64_t save_buffer;             /* buffer address 2 words */
+  uint64_t restore_buffer;          /* buffer address 2 words */
+  uint32_t instruction_buffer_size; /* size of buffer in bytes */
+  uint32_t save_buffer_size;        /* size of buffer in bytes */
+  uint32_t restore_buffer_size;     /* size of buffer in bytes */
+  uint32_t instruction_prop_count;  /* DWORD length of following properties nv
+                                       pairs */
+};
+
+/**
+ * struct ert_cmd_chain_data - interpretation of data payload for ERT_CMD_CHAIN
+ *
+ * @command_count: number of commands in chain
+ * @submit_index:  index of last successfully submitted command in chain
+ * @error_index:   index of failing command if cmd status is not completed
+ * @data[]:        address of each command in chain
+ *
+ * This is the payload of an *ert_packet* when the opcode is ERT_CMD_CHAIN
+ */
+struct ert_cmd_chain_data {
+  uint32_t command_count;
+  uint32_t submit_index;
+  uint32_t error_index;
+  uint32_t reserved[3];
+  uint64_t data[];
+};
+
+/*
+ * ert_write_return_code / ert_read_return_code treat `cmd` as an
+ * ert_start_kernel_cmd. By default the return code lives in
+ * data[count - 1 - extra_cu_masks]; with U30_DEBUG defined it lives in
+ * cu_mask instead.
+ */
+#ifndef U30_DEBUG
+#define ert_write_return_code(cmd, value)                                    \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks;                  \
+    skcmd->data[end_idx] = value;                                            \
+  } while (0)
+
+#define ert_read_return_code(cmd, ret)                                       \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks;                  \
+    ret = skcmd->data[end_idx];                                              \
+  } while (0)
+#else
+/* These are for debug legacy U30 firmware */
+#define ert_write_return_code(cmd, value)                                    \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    skcmd->cu_mask = value;                                                  \
+  } while (0)
+
+#define ert_read_return_code(cmd, ret)                                       \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    ret = skcmd->cu_mask;                                                    \
+  } while (0)
+#endif
+
+/**
+ * struct ert_init_kernel_cmd: ERT
initialize kernel command format
+ * this command initializes CUs by writing CU registers. CUs are
+ * represented by cu_mask and extra_cu_masks.
+ *
+ * @state:           [3-0]   current state of a command
+ * @update_rtp:      [4]     command is for runtime update of cu argument
+ * @extra_cu_masks:  [11-10] extra CU masks in addition to mandatory mask
+ * @count:           [22-12] number of words following header
+ * @opcode:          [27-23] 0, opcode for init_kernel
+ * @type:            [31-28] 0, type of init_kernel
+ *
+ * @cu_run_timeout   the configured CU timeout value in microseconds;
+ *                   setting to 0 means CU should not timeout
+ * @cu_reset_timeout the configured CU reset timeout value in microseconds.
+ *                   When a CU times out, it will be reset; this indicates
+ *                   the CU reset should be completed within the timeout value.
+ *                   If cu_run_timeout is set to 0, this field is undefined.
+ *
+ * @cu_mask:         first mandatory CU mask
+ * @data:            count-9 number of words representing interpreted payload
+ *
+ * The packet payload is comprised of reserved id field, 8 reserved fields,
+ * a mandatory CU mask, and extra_cu_masks per header field, followed by a
+ * CU register map of size (count - (9 + extra_cu_masks)) uint32_t words.
+ */
+struct ert_init_kernel_cmd {
+  union {
+    struct {
+      uint32_t state : 4;          /* [3-0]   */
+      uint32_t update_rtp : 1;     /* [4]     */
+      uint32_t unused : 5;         /* [9-5]   */
+      uint32_t extra_cu_masks : 2; /* [11-10] */
+      uint32_t count : 11;         /* [22-12] */
+      uint32_t opcode : 5;         /* [27-23] */
+      uint32_t type : 4;           /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  uint32_t cu_run_timeout;   /* CU timeout value in Microseconds */
+  uint32_t cu_reset_timeout; /* CU reset timeout value in Microseconds */
+  uint32_t reserved[6];      /* reserved for future use */
+
+  /* payload */
+  uint32_t cu_mask; /* mandatory cu mask */
+  uint32_t data[1]; /* count-9 number of words */
+};
+
+#define KDMA_BLOCK_SIZE 64 /* Limited by KDMA CU */
+struct ert_start_copybo_cmd {
+  uint32_t state : 4;          /* [3-0], must be ERT_CMD_STATE_NEW */
+  uint32_t unused : 6;         /* [9-4] */
+  uint32_t extra_cu_masks : 2; /* [11-10], = 3 */
+  uint32_t count : 11;         /* [22-12], = 16, exclude 'arg' */
+  uint32_t opcode : 5;         /* [27-23], = ERT_START_COPYBO */
+  uint32_t type : 4;           /* [31-28], = ERT_DEFAULT */
+  uint32_t cu_mask[4];         /* mandatory cu masks */
+  uint32_t reserved[4];        /* for scheduler use */
+  uint32_t src_addr_lo;        /* low 32 bit of src addr */
+  uint32_t src_addr_hi;        /* high 32 bit of src addr */
+  uint32_t src_bo_hdl;         /* src bo handle, cleared by driver */
+  uint32_t dst_addr_lo;        /* low 32 bit of dst addr */
+  uint32_t dst_addr_hi;        /* high 32 bit of dst addr */
+  uint32_t dst_bo_hdl;         /* dst bo handle, cleared by driver */
+  uint32_t size;               /* size in bytes low 32 bit*/
+  uint32_t size_hi;            /* size in bytes high 32 bit*/
+  void *arg;                   /* pointer to aux data for KDS */
+};
+
+/**
+ * struct ert_configure_cmd: ERT configure command format
+ *
+ * @state:           [3-0] current state of a command
+ * @count:           [22-12] number of words in payload (5 + num_cus)
+ * @opcode:          [27-23] 2 (ERT_CONFIGURE), opcode for configure
+ * @type:            [31-28] 0, type of configure
+ *
+ * @slot_size:       command queue slot size
+ * @num_cus:         number of compute units in program
+ *
@cu_shift: shift value to convert CU idx to CU addr
+ * @cu_base_addr:    base address to add to CU addr for actual physical address
+ *
+ * @ert:1            enable embedded HW scheduler
+ * @polling:1        poll for command completion
+ * @cu_dma:1         enable CUDMA custom module for HW scheduler
+ * @cu_isr:1         enable CUISR custom module for HW scheduler
+ * @cq_int:1         enable interrupt from host to HW scheduler
+ * @cdma:1           enable CDMA kernel
+ * @dataflow:1, @rw_shared:1, @kds_30:1, @dmsg:1, @echo:1, @intr:1
+ *                   further single-bit feature flags (see struct body)
+ * @unusedf:19
+ * @dsa52:1          reserved for internal use
+ *
+ * @data:            addresses of @num_cus CUs
+ */
+struct ert_configure_cmd {
+  union {
+    struct {
+      uint32_t state : 4;  /* [3-0]   */
+      uint32_t unused : 8; /* [11-4]  */
+      uint32_t count : 11; /* [22-12] */
+      uint32_t opcode : 5; /* [27-23] */
+      uint32_t type : 4;   /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  /* payload */
+  uint32_t slot_size;
+  uint32_t num_cus;
+  uint32_t cu_shift;
+  uint32_t cu_base_addr;
+
+  /* features */
+  uint32_t ert : 1;
+  uint32_t polling : 1;
+  uint32_t cu_dma : 1;
+  uint32_t cu_isr : 1;
+  uint32_t cq_int : 1;
+  uint32_t cdma : 1;
+  uint32_t dataflow : 1;
+  /* WORKAROUND: allow xclRegWrite/xclRegRead access shared CU */
+  uint32_t rw_shared : 1;
+  uint32_t kds_30 : 1;
+  uint32_t dmsg : 1;
+  uint32_t echo : 1;
+  uint32_t intr : 1;
+  uint32_t unusedf : 19;
+  uint32_t dsa52 : 1;
+
+  /* cu address map size is num_cus */
+  uint32_t data[1];
+};
+
+/*
+ * Note: We need to put maximum 128 soft kernel image
+ *       in one config command (1024 DWs including header).
+ *       So each one needs to be smaller than 8 DWs.
+ *
+ * This data struct is obsoleted. Only used in legacy ERT firmware.
+ * Use 'struct config_sk_image_uuid' instead on XGQ based ERT.
+ *
+ * @start_cuidx:     start index of compute units of each image
+ * @num_cus:         number of compute units of each image
+ * @sk_name:         symbol name of soft kernel of each image
+ */
+struct config_sk_image {
+  uint32_t start_cuidx;
+  uint32_t num_cus;
+  uint32_t sk_name[5];
+};
+
+/*
+ * Note: We need to put maximum 128 soft kernel image
+ *       in one config command (1024 DWs including header).
+ *       So each one needs to be smaller than 8 DWs.
+ *
+ * @start_cuidx:     start index of compute units of each image
+ * @num_cus:         number of compute units of each image
+ * @sk_name:         symbol name of soft kernel of each image
+ * @sk_uuid:         xclbin uuid that this soft kernel image belongs to
+ */
+struct config_sk_image_uuid {
+  uint32_t start_cuidx;
+  uint32_t num_cus;
+  uint32_t sk_name[5];
+  unsigned char sk_uuid[16];
+  uint32_t slot_id;
+};
+
+/**
+ * struct ert_configure_sk_cmd: ERT configure soft kernel command format
+ *
+ * @state:           [3-0] current state of a command
+ * @count:           [22-12] number of words in payload
+ * @opcode:          [27-23] 1, opcode for configure
+ * @type:            [31-28] 0, type of configure
+ *
+ * @num_image:       number of images
+ */
+struct ert_configure_sk_cmd {
+  union {
+    struct {
+      uint32_t state : 4;  /* [3-0]   */
+      uint32_t unused : 8; /* [11-4]  */
+      uint32_t count : 11; /* [22-12] */
+      uint32_t opcode : 5; /* [27-23] */
+      uint32_t type : 4;   /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  /* payload */
+  uint32_t num_image;
+  struct config_sk_image image[1];
+};
+
+/**
+ * struct ert_unconfigure_sk_cmd: ERT unconfigure soft kernel command format
+ *
+ * @state:           [3-0] current state of a command
+ * @count:           [22-12] number of words in payload
+ * @opcode:          [27-23] 1, opcode for configure
+ * @type:            [31-28] 0, type of configure
+ *
+ * @start_cuidx:     start index of compute units
+ * @num_cus:         number of compute units in program
+ */
+struct ert_unconfigure_sk_cmd {
+  union {
+    struct {
+      uint32_t state : 4;  /* [3-0]   */
+      uint32_t unused : 8; /* [11-4]  */
+      uint32_t count : 11; /* [22-12] */
+      uint32_t opcode : 5; /* [27-23] */
+      uint32_t type : 4;   /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  /* payload */
+  uint32_t start_cuidx;
+  uint32_t num_cus;
+};
+
+/**
+ * struct ert_abort_cmd: ERT abort command format.
+ *
+ * @exec_bo_handle: The bo handle of execbuf command to abort
+ */
+struct ert_abort_cmd {
+  union {
+    struct {
+      uint32_t state : 4;  /* [3-0]   */
+      uint32_t custom : 8; /* [11-4]  */
+      uint32_t count : 11; /* [22-12] */
+      uint32_t opcode : 5; /* [27-23] */
+      uint32_t type : 4;   /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  /* payload */
+  uint64_t exec_bo_handle;
+};
+
+/**
+ * struct ert_validate_cmd: ERT BIST command format.
+ *
+ */
+struct ert_validate_cmd {
+  union {
+    struct {
+      uint32_t state : 4;  /* [3-0]   */
+      uint32_t custom : 8; /* [11-4]  */
+      uint32_t count : 11; /* [22-12] */
+      uint32_t opcode : 5; /* [27-23] */
+      uint32_t type : 4;   /* [31-28] */
+    };
+    uint32_t header;
+  };
+  uint32_t timestamp;
+  uint32_t cq_read_single;
+  uint32_t cq_write_single;
+  uint32_t cu_read_single;
+  uint32_t cu_write_single;
+};
+
+/**
+ * struct ert_access_valid_cmd: ERT BIST command format.
+ *
+ */
+struct ert_access_valid_cmd {
+  union {
+    struct {
+      uint32_t state : 4;  /* [3-0]   */
+      uint32_t custom : 8; /* [11-4]  */
+      uint32_t count : 11; /* [22-12] */
+      uint32_t opcode : 5; /* [27-23] */
+      uint32_t type : 4;   /* [31-28] */
+    };
+    uint32_t header;
+  };
+  uint32_t h2h_access;
+  uint32_t h2d_access;
+  uint32_t d2h_access;
+  uint32_t d2d_access;
+  uint32_t d2cu_access;
+  uint32_t wr_count;
+  uint32_t wr_test;
+};
+
+/**
+ * ERT command state
+ *
+ * @ERT_CMD_STATE_NEW:         Set by host before submitting a command to
+ *                             scheduler
+ * @ERT_CMD_STATE_QUEUED:      Internal scheduler state
+ * @ERT_CMD_STATE_SUBMITTED:   Internal scheduler state
+ * @ERT_CMD_STATE_RUNNING:     Internal scheduler state
+ * @ERT_CMD_STATE_COMPLETED:   Set by scheduler when command completes
+ * @ERT_CMD_STATE_ERROR:       Set by scheduler if command failed
+ * @ERT_CMD_STATE_ABORT:       Set by scheduler if command abort
+ * @ERT_CMD_STATE_TIMEOUT:     Set by scheduler if command timeout and reset
+ * @ERT_CMD_STATE_NORESPONSE:  Set by scheduler if command timeout and fail to
+ *                             reset
+ */
+enum ert_cmd_state {
+  ERT_CMD_STATE_NEW = 1,
+  ERT_CMD_STATE_QUEUED = 2,
+  ERT_CMD_STATE_RUNNING = 3,
+  ERT_CMD_STATE_COMPLETED = 4,
+  ERT_CMD_STATE_ERROR = 5,
+  ERT_CMD_STATE_ABORT = 6,
+  ERT_CMD_STATE_SUBMITTED = 7,
+  ERT_CMD_STATE_TIMEOUT = 8,
+  ERT_CMD_STATE_NORESPONSE = 9,
+  ERT_CMD_STATE_SKERROR = 10,    // Check for error return code from Soft Kernel
+  ERT_CMD_STATE_SKCRASHED = 11,  // Soft kernel has crashed
+  ERT_CMD_STATE_MAX,             // Always the last one
+};
+
+struct cu_cmd_state_timestamps {
+  uint64_t skc_timestamps[ERT_CMD_STATE_MAX];  // In nanoseconds
+};
+
+/**
+ * Opcode types for commands
+ *
+ * @ERT_START_CU:       start a workgroup on a CU
+ * @ERT_START_KERNEL:   currently aliased to ERT_START_CU
+ * @ERT_CONFIGURE:      configure command scheduler
+ * @ERT_EXEC_WRITE:     execute a specified CU after writing
+ * @ERT_CU_STAT:        get stats about CU execution
+ * @ERT_START_COPYBO:   start KDMA CU or P2P, may be converted to
+ *
ERT_START_CU before the cmd reaches the scheduler, short-term hack
+ * @ERT_SK_CONFIG:      configure soft kernel
+ * @ERT_SK_START:       start a soft kernel
+ * @ERT_SK_UNCONFIG:    unconfigure a soft kernel
+ * @ERT_START_KEY_VAL:  same as ERT_START_CU but with key-value pair flavor
+ * @ERT_START_DPU:      instruction buffer command format
+ * @ERT_CMD_CHAIN:      command chain
+ * @ERT_START_NPU:      instruction buffer command format on NPU format
+ * @ERT_START_NPU_PREEMPT:  instruction buffer command with preemption format on
+ *                          NPU
+ */
+enum ert_cmd_opcode {
+  ERT_START_CU = 0,
+  ERT_START_KERNEL = 0,
+  ERT_CONFIGURE = 2,
+  ERT_EXIT = 3,
+  ERT_ABORT = 4,
+  ERT_EXEC_WRITE = 5,
+  ERT_CU_STAT = 6,
+  ERT_START_COPYBO = 7,
+  ERT_SK_CONFIG = 8,
+  ERT_SK_START = 9,
+  ERT_SK_UNCONFIG = 10,
+  ERT_INIT_CU = 11,
+  ERT_START_FA = 12,
+  ERT_CLK_CALIB = 13,
+  ERT_MB_VALIDATE = 14,
+  ERT_START_KEY_VAL = 15,
+  ERT_ACCESS_TEST_C = 16,
+  ERT_ACCESS_TEST = 17,
+  ERT_START_DPU = 18,
+  ERT_CMD_CHAIN = 19,
+  ERT_START_NPU = 20,
+  ERT_START_NPU_PREEMPT = 21,
+};
+
+/**
+ * Command types
+ *
+ * @ERT_DEFAULT:        default command type
+ * @ERT_KDS_LOCAL:      command processed by KDS locally
+ * @ERT_CTRL:           control command uses reserved command queue slot
+ * @ERT_CU:             compute unit command
+ */
+enum ert_cmd_type {
+  ERT_DEFAULT = 0,
+  ERT_KDS_LOCAL = 1,
+  ERT_CTRL = 2,
+  ERT_CU = 3,
+  ERT_SCU = 4,
+};
+
+/**
+ * Soft kernel types
+ *
+ * @SOFTKERNEL_TYPE_EXEC:       executable
+ */
+enum softkernel_type {
+  SOFTKERNEL_TYPE_EXEC = 0,
+};
+
+/*
+ * Base address GPIO per spec
+ * | Offset  | Description
+ * -----------------------
+ * | 0x00    | ERT_MGMT_PF_base_addr (Not sure where this should be used)
+ * | 0x08    | ERT_USER_PF_base_addr.
The base address of ERT peripherals + */ +#if defined(ERT_BUILD_V20) +uint32_t ert_base_addr = 0; +#define ERT_BASE_ADDR 0x01F30008 +#endif + +#if defined(ERT_BUILD_V30) +uint32_t ert_base_addr = 0; +#define ERT_BASE_ADDR 0x01F30008 +#endif + +/** + * Address constants per spec + */ +#define ERT_WORD_SIZE 4 /* 4 bytes */ +#define ERT_CQ_SIZE 0x10000 /* 64K */ +#if defined(ERT_BUILD_U50) +#define ERT_CQ_BASE_ADDR 0x340000 +#define ERT_CSR_ADDR 0x360000 +#elif defined(ERT_BUILD_V20) +#define ERT_CQ_BASE_ADDR (0x000000 + ert_base_addr) +#define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#elif defined(ERT_BUILD_V30) +#define ERT_CQ_BASE_ADDR 0x1F60000 +#define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#else +#define ERT_CQ_BASE_ADDR 0x190000 +#define ERT_CSR_ADDR 0x180000 +#endif + +/** + * The STATUS REGISTER is for communicating completed CQ slot indices + * MicroBlaze write, host reads. MB(W) / HOST(COR) + */ +#define ERT_STATUS_REGISTER_ADDR (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR0 (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR1 (ERT_CSR_ADDR + 0x4) +#define ERT_STATUS_REGISTER_ADDR2 (ERT_CSR_ADDR + 0x8) +#define ERT_STATUS_REGISTER_ADDR3 (ERT_CSR_ADDR + 0xC) + +/** + * The CU DMA REGISTER is for communicating which CQ slot is to be started + * on a specific CU. 
MB selects a free CU on which the command can + * run, then writes the 1<state = ERT_CMD_STATE_NEW; + pkt->extra_cu_masks = 3; + pkt->count = 16; + pkt->opcode = ERT_START_COPYBO; + pkt->type = ERT_DEFAULT; + pkt->cu_mask[0] = 0; + pkt->cu_mask[1] = 0; + pkt->cu_mask[2] = 0; + pkt->cu_mask[3] = 0; + pkt->src_addr_lo = (uint32_t)src_offset; + pkt->src_addr_hi = (src_offset >> 32) & 0xFFFFFFFF; + pkt->src_bo_hdl = src_bo; + pkt->dst_addr_lo = (uint32_t)dst_offset; + pkt->dst_addr_hi = (dst_offset >> 32) & 0xFFFFFFFF; + pkt->dst_bo_hdl = dst_bo; + pkt->size = size; + pkt->size_hi = 0; /* set to 0 explicitly */ + pkt->arg = 0; +} +static inline uint64_t ert_copybo_src_offset(struct ert_start_copybo_cmd *pkt) { + return (uint64_t)pkt->src_addr_hi << 32 | pkt->src_addr_lo; +} +static inline uint64_t ert_copybo_dst_offset(struct ert_start_copybo_cmd *pkt) { + return (uint64_t)pkt->dst_addr_hi << 32 | pkt->dst_addr_lo; +} +static inline uint64_t ert_copybo_size(struct ert_start_copybo_cmd *pkt) { + return pkt->size; +} + +static inline bool ert_valid_opcode(struct ert_packet *pkt) { + struct ert_start_kernel_cmd *skcmd; + struct ert_init_kernel_cmd *ikcmd; + struct ert_start_copybo_cmd *sccmd; + struct ert_configure_cmd *ccmd; + struct ert_configure_sk_cmd *cscmd; + struct ert_cmd_chain_data *ccdata; + bool valid; + + switch (pkt->opcode) { + case ERT_START_CU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 4 registers */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 4); + break; + case ERT_START_DPU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + size (in words) of ert_dpu_data + */ + valid = + (skcmd->count >= 1 + skcmd->extra_cu_masks + + sizeof(struct ert_dpu_data) / sizeof(uint32_t)); + break; + case ERT_CMD_CHAIN: + ccdata = (struct ert_cmd_chain_data *)pkt->data; + /* header count must match number of commands in payload */ + valid = (pkt->count == (ccdata->command_count * sizeof(uint64_t) + + sizeof(struct 
ert_cmd_chain_data)) / + sizeof(uint32_t)); + break; + case ERT_START_NPU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + ert_npu_data */ + valid = + (skcmd->count >= 1 + skcmd->extra_cu_masks + + sizeof(struct ert_npu_data) / sizeof(uint32_t)); + break; + case ERT_START_NPU_PREEMPT: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + ert_npu_preempt_data */ + valid = (skcmd->count >= + 1 + skcmd->extra_cu_masks + + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t)); + break; + case ERT_START_KEY_VAL: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1); + break; + case ERT_EXEC_WRITE: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 6 registers */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 6); + break; + case ERT_START_FA: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1); + break; + case ERT_SK_START: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 1 control word */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 1); + break; + case ERT_CONFIGURE: + ccmd = to_cfg_pkg(pkt); + /* 5 mandatory fields in struct */ + valid = (ccmd->count >= 5 + ccmd->num_cus); + break; + case ERT_START_COPYBO: + sccmd = to_copybo_pkg(pkt); + valid = (sccmd->count == 16); + break; + case ERT_INIT_CU: + ikcmd = to_init_krnl_pkg(pkt); + /* 9 mandatory words in struct + 4 control registers */ + valid = (ikcmd->count >= ikcmd->extra_cu_masks + 9 + 4); + break; + case ERT_SK_CONFIG: + cscmd = to_cfg_sk_pkg(pkt); + valid = (cscmd->count == + sizeof(struct config_sk_image) * cscmd->num_image / 4 + 1); + break; + case ERT_CLK_CALIB: + case ERT_MB_VALIDATE: + case ERT_ACCESS_TEST_C: + case ERT_CU_STAT: /* TODO: Rules to validate? 
*/ + case ERT_EXIT: + case ERT_ABORT: + valid = true; + break; + case ERT_SK_UNCONFIG: /* NOTE: obsolete */ + default: + valid = false; + } + + return valid; +} + +static inline uint64_t get_ert_packet_size_bytes(struct ert_packet *pkt) { + // header plus payload + return sizeof(pkt->header) + pkt->count * sizeof(uint32_t); +} + +static inline struct ert_dpu_data *get_ert_dpu_data( + struct ert_start_kernel_cmd *pkt) { + if (pkt->opcode != ERT_START_DPU) return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_dpu_data *)(pkt->data + pkt->extra_cu_masks); +} + +static inline struct ert_dpu_data *get_ert_dpu_data_next( + struct ert_dpu_data *dpu_data) { + if (dpu_data->chained == 0) return NULL; + + return dpu_data + 1; +} + +static inline struct ert_cmd_chain_data *get_ert_cmd_chain_data( + struct ert_packet *pkt) { + if (pkt->opcode != ERT_CMD_CHAIN) return NULL; + + return (struct ert_cmd_chain_data *)pkt->data; +} + +static inline struct ert_npu_data *get_ert_npu_data( + struct ert_start_kernel_cmd *pkt) { + if (pkt->opcode != ERT_START_NPU) return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_npu_data *)(pkt->data + pkt->extra_cu_masks); +} + +static inline struct ert_npu_preempt_data *get_ert_npu_preempt_data( + struct ert_start_kernel_cmd *pkt) { + if (pkt->opcode != ERT_START_NPU_PREEMPT) return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_npu_preempt_data *)(pkt->data + pkt->extra_cu_masks); +} + +static inline uint32_t *get_ert_regmap_begin(struct ert_start_kernel_cmd *pkt) { + switch (pkt->opcode) { + case ERT_START_DPU: + return pkt->data + pkt->extra_cu_masks + + (get_ert_dpu_data(pkt)->chained + 1) * + sizeof(struct ert_dpu_data) / sizeof(uint32_t); + + case ERT_START_NPU: + return pkt->data + pkt->extra_cu_masks + + sizeof(struct ert_npu_data) / sizeof(uint32_t) + + get_ert_npu_data(pkt)->instruction_prop_count; + + case ERT_START_NPU_PREEMPT: + 
return pkt->data + pkt->extra_cu_masks + + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t) + + get_ert_npu_preempt_data(pkt)->instruction_prop_count; + + default: + // skip past embedded extra cu_masks + return pkt->data + pkt->extra_cu_masks; + } +} + +static inline uint32_t *get_ert_regmap_end(struct ert_start_kernel_cmd *pkt) { + // pkt->count includes the mandatory cumask which precededs data array + return &pkt->cu_mask + pkt->count; +} + +static inline uint64_t get_ert_regmap_size_bytes( + struct ert_start_kernel_cmd *pkt) { + return (get_ert_regmap_end(pkt) - get_ert_regmap_begin(pkt)) * + sizeof(uint32_t); +} + +#ifdef __linux__ +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +static inline struct cu_cmd_state_timestamps *ert_start_kernel_timestamps( + struct ert_start_kernel_cmd *pkt) { + uint64_t offset = pkt->count * sizeof(uint32_t) + sizeof(pkt->header); + /* Make sure the offset of timestamps are properly aligned. */ + return ( + struct cu_cmd_state_timestamps *)((char *)pkt + + P2ROUNDUP(offset, sizeof(uint64_t))); +} + +/* Return 0 if this pkt doesn't support timestamp or disabled */ +static inline int get_size_with_timestamps_or_zero(struct ert_packet *pkt) { + struct ert_start_kernel_cmd *skcmd; + int size = 0; + + switch (pkt->opcode) { + case ERT_START_CU: + case ERT_EXEC_WRITE: + case ERT_START_FA: + case ERT_SK_START: + skcmd = to_start_krnl_pkg(pkt); + if (skcmd->stat_enabled) { + size = (char *)ert_start_kernel_timestamps(skcmd) - (char *)pkt; + size += sizeof(struct cu_cmd_state_timestamps); + } + } + + return size; +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#ifdef _WIN32 +#pragma warning(pop) +#endif + +#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp new file mode 100644 index 000000000..2f0ea7c14 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp @@ -0,0 
+1,240 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "fence.h" + +#include + +#include "hwctx.h" + +namespace { + +uint32_t create_syncobj(const shim_xdna::pdev &dev) { + drm_syncobj_create csobj = {.handle = AMDXDNA_INVALID_FENCE_HANDLE, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_CREATE, &csobj); + return csobj.handle; +} + +void destroy_syncobj(const shim_xdna::pdev &dev, uint32_t hdl) { + drm_syncobj_destroy dsobj = {.handle = hdl}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_DESTROY, &dsobj); +} + +uint64_t query_syncobj_timeline(const shim_xdna::pdev &dev, uint32_t sobj_hdl) { + uint64_t point = 0; + drm_syncobj_timeline_array sobjs = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&point), + .count_handles = 1, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_QUERY, &sobjs); + return point; +} + +int export_syncobj(const shim_xdna::pdev &dev, uint32_t sobj_hdl) { + drm_syncobj_handle esobj = { + .handle = sobj_hdl, + .flags = 0, + .fd = -1, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &esobj); + return esobj.fd; +} + +uint32_t import_syncobj(const shim_xdna::pdev &dev, int fd) { + drm_syncobj_handle isobj = { + .handle = AMDXDNA_INVALID_FENCE_HANDLE, + .flags = 0, + .fd = fd, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &isobj); + return isobj.handle; +} + +void signal_syncobj(const shim_xdna::pdev &dev, uint32_t sobj_hdl, + uint64_t timepoint) { + drm_syncobj_timeline_array sobjs = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&timepoint), + .count_handles = 1, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, &sobjs); +} + +void wait_syncobj_done(const shim_xdna::pdev &dev, uint32_t sobj_hdl, + uint64_t timepoint) { + drm_syncobj_timeline_wait wsobj = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&timepoint), + .timeout_nsec = std::numeric_limits::max(), /* wait forever */ + .count_handles = 1, 
+ .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); +} + +void wait_syncobj_available(const shim_xdna::pdev &dev, + const uint32_t *sobj_hdls, + const uint64_t *timepoints, uint32_t num) { + drm_syncobj_timeline_wait wsobj = { + .handles = reinterpret_cast(sobj_hdls), + .points = reinterpret_cast(timepoints), + .timeout_nsec = std::numeric_limits::max(), /* wait forever */ + .count_handles = num, + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); +} + +void submit_wait_syncobjs(const shim_xdna::pdev &dev, + const shim_xdna::hw_ctx *ctx, + const uint32_t *sobj_hdls, const uint64_t *points, + uint32_t num) { + wait_syncobj_available(dev, sobj_hdls, points, num); + + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->m_handle, + .type = AMDXDNA_CMD_SUBMIT_DEPENDENCY, + .cmd_handles = reinterpret_cast(sobj_hdls), + .args = reinterpret_cast(points), + .cmd_count = num, + .arg_count = num, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); +} + +void submit_signal_syncobj(const shim_xdna::pdev &dev, + const shim_xdna::hw_ctx *ctx, uint32_t sobj_hdl, + uint64_t point) { + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->m_handle, + .type = AMDXDNA_CMD_SUBMIT_SIGNAL, + .cmd_handles = sobj_hdl, + .args = point, + .cmd_count = 1, + .arg_count = 1, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); +} + +} // namespace + +namespace shim_xdna { + +fence_handle::fence_handle(const device &device) + : m_pdev(device.get_pdev()), + m_import(std::make_unique(-1)), + m_syncobj_hdl(create_syncobj(m_pdev)) { + shim_debug("Fence allocated: %d@%d", m_syncobj_hdl, m_state); +} + +fence_handle::fence_handle(const device &device, int ehdl) + : m_pdev(device.get_pdev()), + m_import(std::make_unique(ehdl)), + m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())) { + shim_debug("Fence 
imported: %d@%ld", m_syncobj_hdl, m_state); +} + +fence_handle::fence_handle(const fence_handle &f) + : m_pdev(f.m_pdev), + m_import(f.share_handle()), + m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())), + m_signaled{f.m_signaled}, + m_state{f.m_state} { + shim_debug("Fence cloned: %d@%ld", m_syncobj_hdl, m_state); +} + +fence_handle::~fence_handle() { + shim_debug("Fence going away: %d@%ld", m_syncobj_hdl, m_state); + try { + destroy_syncobj(m_pdev, m_syncobj_hdl); + } catch (const std::system_error &e) { + shim_debug("Failed to destroy fence_handle"); + } +} + +std::unique_ptr fence_handle::share_handle() const { + if (m_state != initial_state) + shim_err(-EINVAL, "Can't share fence_handle not at initial state."); + + return std::make_unique(export_syncobj(m_pdev, m_syncobj_hdl)); +} + +uint64_t fence_handle::get_next_state() const { return m_state + 1; } + +std::unique_ptr fence_handle::clone() const { + return std::make_unique(*this); +} + +uint64_t fence_handle::wait_next_state() const { + std::lock_guard guard(m_lock); + + if (m_state != initial_state && m_signaled) + shim_err(-EINVAL, + "Can't wait on fence_handle that has been signaled before."); + return ++m_state; +} + +// Timeout value is ignored for now. 
+void fence_handle::wait(uint32_t timeout_ms) const { + auto st = signal_next_state(); + shim_debug("Waiting for command fence_handle %d@%ld", m_syncobj_hdl, st); + wait_syncobj_done(m_pdev, m_syncobj_hdl, st); +} + +void fence_handle::submit_wait(const hw_ctx *ctx) const { + auto st = signal_next_state(); + shim_debug("Submitting wait for command fence_handle %d@%ld", m_syncobj_hdl, + st); + submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1); +} + +uint64_t fence_handle::signal_next_state() const { + std::lock_guard guard(m_lock); + + if (m_state != initial_state && !m_signaled) + shim_err(-EINVAL, "Can't signal fence_handle that has been waited before."); + if (m_state == initial_state) m_signaled = true; + return ++m_state; +} + +void fence_handle::signal() const { + auto st = signal_next_state(); + shim_debug("Signaling command fence_handle %d@%ld", m_syncobj_hdl, st); + signal_syncobj(m_pdev, m_syncobj_hdl, st); +} + +void fence_handle::submit_signal(const hw_ctx *ctx) const { + auto st = signal_next_state(); + shim_debug("Submitting signal command fence_handle %d@%ld", m_syncobj_hdl, + st); + submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st); +} + +void fence_handle::submit_wait( + const pdev &dev, const hw_ctx *ctx, + const std::vector &fences) { + constexpr int max_fences = 1024; + uint32_t hdls[max_fences]; + uint64_t pts[max_fences]; + int i = 0; + + if (fences.size() > max_fences) + shim_err(-EINVAL, "Too many fences in one submit: %d", fences.size()); + + for (auto f : fences) { + auto fh = static_cast(f); + auto st = fh->wait_next_state(); + shim_debug("Waiting for command fence_handle %d@%ld", fh->m_syncobj_hdl, + st); + hdls[i] = fh->m_syncobj_hdl; + pts[i] = st; + i++; + } + submit_wait_syncobjs(dev, ctx, hdls, pts, i); +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h new file mode 100644 index 000000000..d650adb7a 
--- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef _FENCE_XDNA_H_ +#define _FENCE_XDNA_H_ + +#include +#include + +#include "shim_debug.h" + +namespace shim_xdna { +struct pdev; +struct device; +struct hw_ctx; + +struct shared_handle { + shared_handle(int fd) : m_fd(fd) {} + + ~shared_handle() { + if (m_fd != -1) close(m_fd); + } + int get_export_handle() const { return m_fd; } + + const int m_fd; +}; + +struct fence_handle { + using export_handle = int; + const pdev &m_pdev; + const std::unique_ptr m_import; + uint32_t m_syncobj_hdl; + // Protecting below mutables + mutable std::mutex m_lock; + // Set once at first signal + mutable bool m_signaled = false; + // Ever incrementing at each wait/signal + static constexpr uint64_t initial_state = 0; + mutable uint64_t m_state = initial_state; + enum class access_mode : uint8_t { local, shared, process, hybrid }; + + fence_handle(const device &device); + fence_handle(const device &device, int ehdl); + fence_handle(const fence_handle &); + ~fence_handle(); + + std::unique_ptr clone() const; + std::unique_ptr share_handle() const; + void wait(uint32_t timeout_ms) const; + uint64_t get_next_state() const; + void signal() const; + void submit_wait(const hw_ctx *) const; + static void submit_wait(const pdev &dev, const hw_ctx *, + const std::vector &fences); + void submit_signal(const hw_ctx *) const; + uint64_t wait_next_state() const; + uint64_t signal_next_state() const; +}; + +} // namespace shim_xdna + +#endif // _FENCE_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp new file mode 100644 index 000000000..a81300001 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -0,0 +1,200 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "hwctx.h" + +#include +#include + +#include "bo.h" +#include "core/common/api/xclbin_int.h" +#include "hwq.h" + +namespace { + +std::vector get_pdi(const xrt_core::xclbin::aie_partition_obj &aie, + uint16_t kernel_id) { + for (auto &pdi : aie.pdis) { + for (auto &cdo : pdi.cdo_groups) { + for (auto kid : cdo.kernel_ids) { + if (kid == kernel_id) return pdi.pdi; + } + } + } + shim_xdna::shim_err(ENOENT, "PDI for kernel ID 0x%x not found", kernel_id); +} + +} // namespace + +namespace shim_xdna { + +hw_ctx::hw_ctx(device &dev, const qos_t &qos, std::unique_ptr q, + const xrt::xclbin &xclbin) + : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) { + shim_debug("Creating HW context..."); + + for (auto &[key, value] : qos) { + if (key == "gops") + m_qos.gops = value; + else if (key == "fps") + m_qos.fps = value; + else if (key == "dma_bandwidth") + m_qos.dma_bandwidth = value; + else if (key == "latency") + m_qos.latency = value; + else if (key == "frame_execution_time") + m_qos.frame_exec_time = value; + else if (key == "priority") + m_qos.priority = value; + } + + auto axlf = xclbin.get_axlf(); + auto aie_partition = xrt_core::xclbin::get_aie_partition(axlf); + + for (const auto &k : xclbin.get_kernels()) { + auto &props = xrt_core::xclbin_int::get_properties(k); + try { + for (const auto &cu : k.get_cus()) { + m_cu_info.push_back({.m_name = cu.get_name(), + .m_func = props.functional, + .m_pdi = get_pdi(aie_partition, props.kernel_id)}); + } + } catch (std::system_error &ex) { + if (ex.code().value() != ENOENT) throw; + shim_debug("%s", ex.what()); + } + } + + if (m_cu_info.empty()) + shim_err(EINVAL, "No valid DPU kernel found in xclbin"); + m_ops_per_cycle = aie_partition.ops_per_cycle; + m_num_cols = aie_partition.ncol; +} + +hw_ctx::~hw_ctx() { + try { + delete_ctx_on_device(); + } catch (const std::system_error &e) { + 
shim_debug("Failed to delete context on device: %s", e.what()); + } + shim_debug("Destroyed HW context (%d)...", m_handle); + shim_debug("Destroying KMQ HW context (%d)...", m_handle); +} + +cuidx_type hw_ctx::open_cu_context(const std::string &cu_name) { + for (uint32_t i = 0; i < m_cu_info.size(); i++) { + auto &ci = m_cu_info[i]; + if (ci.m_name == cu_name) return cuidx_type{.index = i}; + } + + shim_err(ENOENT, "CU name (%s) not found", cu_name.c_str()); +} + +std::unique_ptr hw_ctx::alloc_bo(size_t size, uint64_t flags) { + return alloc_bo(nullptr, size, flags); +} + +std::unique_ptr hw_ctx::import_bo(pid_t pid, int ehdl) { + // const_cast: import_bo() is not const yet in device class + return m_device.import_bo(pid, ehdl); +} + +hw_q *hw_ctx::get_hw_queue() const { return m_q.get(); } + +void hw_ctx::create_ctx_on_device() { + amdxdna_drm_create_hwctx arg = {}; + arg.qos_p = reinterpret_cast(&m_qos); + arg.umq_bo = m_q->m_queue_boh; + arg.max_opc = m_ops_per_cycle; + // TODO(max) + // throw std::runtime_error("TODO(max): core_rows"); + // arg.num_tiles = m_num_cols * + // xrt_core::device_query(&m_device).core_rows; + arg.num_tiles = m_num_cols * 4; + arg.log_buf_bo = + m_log_bo ? 
m_log_bo->get_drm_bo_handle() : AMDXDNA_INVALID_BO_HANDLE; + m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &arg); + + m_handle = arg.handle; + m_doorbell = arg.umq_doorbell; + + m_q->bind_hwctx(this); +} + +void hw_ctx::delete_ctx_on_device() { + if (m_handle == AMDXDNA_INVALID_CTX_HANDLE) return; + + m_q->unbind_hwctx(); + amdxdna_drm_destroy_hwctx arg = {}; + arg.handle = m_handle; + m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &arg); + + fini_log_buf(); +} + +void hw_ctx::init_log_buf() { + auto log_buf_size = m_num_cols * 1024; + m_log_bo = alloc_bo(nullptr, log_buf_size, XCL_BO_FLAGS_EXECBUF); + m_log_buf = m_log_bo->map(map_type::write); + std::memset(m_log_buf, 0, log_buf_size); +} + +void hw_ctx::fini_log_buf() const { + if (m_log_bo) m_log_bo->unmap(m_log_buf); +} + +hw_ctx::hw_ctx(device &device, const xrt::xclbin &xclbin, const qos_t &qos) + : hw_ctx(device, qos, std::make_unique(device), xclbin) { + create_ctx_on_device(); + std::vector cu_conf_param_buf(sizeof(amdxdna_hwctx_param_config_cu) + + m_cu_info.size() * + sizeof(amdxdna_cu_config)); + auto cu_conf_param = reinterpret_cast( + cu_conf_param_buf.data()); + + cu_conf_param->num_cus = m_cu_info.size(); + xcl_bo_flags f = {}; + f.flags = XRT_BO_FLAGS_CACHEABLE; + for (int i = 0; i < m_cu_info.size(); i++) { + auto &ci = m_cu_info[i]; + + m_pdi_bos.push_back(alloc_bo(nullptr, ci.m_pdi.size(), f.all)); + auto &pdi_bo = m_pdi_bos[i]; + auto pdi_vaddr = reinterpret_cast(pdi_bo->map(map_type::write)); + + // see cu_configs[1] in amdxdna_hwctx_param_config_cu + assert(i < 1 && "only 1 CU supported"); + auto &cf = cu_conf_param->cu_configs[i]; + std::memcpy(pdi_vaddr, ci.m_pdi.data(), ci.m_pdi.size()); + pdi_bo->sync(direction::host2device, pdi_bo->get_properties().size, 0); + cf.cu_bo = pdi_bo.get()->get_drm_bo_handle(); + cf.cu_func = ci.m_func; + } + + amdxdna_drm_config_hwctx arg = {}; + arg.handle = m_handle; + arg.param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU; + arg.param_val = 
reinterpret_cast(cu_conf_param); + arg.param_val_size = cu_conf_param_buf.size(); + m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &arg); + + shim_debug("Created KMQ HW context (%d)", m_handle); +} + +std::unique_ptr hw_ctx::alloc_bo(void *userptr, size_t size, + uint64_t flags) { + // const_cast: alloc_bo() is not const yet in device class + // Debug buffer is specific to one context. + if (xcl_bo_flags{flags}.use == XRT_BO_USE_DEBUG) + return m_device.alloc_bo(userptr, m_handle, size, flags); + // Other BOs are shared across all contexts. + return m_device.alloc_bo(userptr, AMDXDNA_INVALID_CTX_HANDLE, size, flags); +} + +std::unique_ptr create_hw_context(device &dev, + const xrt::xclbin &xclbin, + const hw_ctx::qos_t &qos) { + return std::make_unique(dev, xclbin, qos); +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h new file mode 100644 index 000000000..1f2a62277 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef _HWCTX_XDNA_H_ +#define _HWCTX_XDNA_H_ + +#include + +#include "amdxdna_accel.h" +#include "device.h" +#include "shim_debug.h" + +namespace shim_xdna { + +struct hw_q; +struct bo; +struct device; + +struct cu_info { + std::string m_name; + size_t m_func; + std::vector m_pdi; +}; + +struct cuidx_type { + union { + std::uint32_t index; + struct { + std::uint16_t domain_index; // [15-0] + std::uint16_t domain; // [31-16] + }; + }; + + // Ensure consistent use of domain and index types + using domain_type = uint16_t; + using domain_index_type = uint16_t; +}; + +struct hw_ctx { + using qos_t = std::map; + enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; + device &m_device; + uint32_t m_handle = AMDXDNA_INVALID_CTX_HANDLE; + amdxdna_qos_info m_qos = {}; + std::vector m_cu_info; + std::unique_ptr m_q; + uint32_t m_ops_per_cycle; + uint32_t m_num_cols; + uint32_t m_doorbell; + std::unique_ptr m_log_bo; + void *m_log_buf; + std::vector> m_pdi_bos; + + hw_ctx(device &dev, const qos_t &qos, std::unique_ptr q, + const xrt::xclbin &xclbin); + hw_ctx(device &dev, const xrt::xclbin &xclbin, const qos_t &qos); + ~hw_ctx(); + + // TODO + std::unique_ptr alloc_bo(void *userptr, size_t size, uint64_t flags); + std::unique_ptr alloc_bo(size_t size, uint64_t flags); + std::unique_ptr import_bo(pid_t, int); + cuidx_type open_cu_context(const std::string &cuname); + void create_ctx_on_device(); + void init_log_buf(); + void fini_log_buf() const; + void delete_ctx_on_device(); + + hw_q *get_hw_queue() const; +}; + +std::unique_ptr create_hw_context( + device &dev, const xrt::xclbin &xclbin, + const std::map &qos); + +} // namespace shim_xdna + +#endif // _HWCTX_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp new file mode 100644 index 000000000..d41b7c47e --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -0,0 +1,108 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "hwq.h" + +#include "bo.h" +#include "ert.h" +#include "fence.h" +#include "shim_debug.h" + +namespace { + +ert_packet *get_chained_command_pkt(shim_xdna::bo *boh) { + auto cmdpkt = + reinterpret_cast(boh->map(shim_xdna::map_type::write)); + return cmdpkt->opcode == ERT_CMD_CHAIN ? cmdpkt : nullptr; +} + +int wait_cmd(const shim_xdna::pdev &pdev, const shim_xdna::hw_ctx *ctx, + shim_xdna::bo *cmd, uint32_t timeout_ms) { + int ret = 1; + auto id = cmd->get_cmd_id(); + + shim_xdna::shim_debug("Waiting for cmd (%ld)...", id); + + amdxdna_drm_wait_cmd wcmd = { + .hwctx = ctx->m_handle, + .timeout = timeout_ms, + .seq = id, + }; + + try { + pdev.ioctl(DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd); + } catch (const std::system_error &ex) { + if (ex.code().value() != ETIME) throw; + ret = 0; + } + return ret; +} + +} // namespace + +namespace shim_xdna { + +hw_q::hw_q(const device &device) + : m_hwctx(nullptr), + m_pdev(device.get_pdev()), + m_queue_boh(AMDXDNA_INVALID_BO_HANDLE) { + shim_debug("Created KMQ HW queue"); +} + +void hw_q::bind_hwctx(const hw_ctx *ctx) { + m_hwctx = ctx; + shim_debug("Bond HW queue to HW context %d", m_hwctx->m_handle); +} + +void hw_q::unbind_hwctx() { + shim_debug("Unbond HW queue from HW context %d", m_hwctx->m_handle); + m_hwctx = nullptr; +} + +int hw_q::wait_command(bo *cmd, uint32_t timeout_ms) const { + if (poll_command(cmd)) return 1; + return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); +} + +void hw_q::submit_wait(const fence_handle *f) { f->submit_wait(m_hwctx); } + +void hw_q::submit_wait(const std::vector &fences) { + fence_handle::submit_wait(m_pdev, m_hwctx, fences); +} + +void hw_q::submit_signal(const fence_handle *f) { f->submit_signal(m_hwctx); } + +hw_q::~hw_q() { shim_debug("Destroying KMQ HW queue"); } + +void hw_q::issue_command(bo *cmd_bo) { + // Assuming 1024 max args per cmd bo + const size_t 
max_arg_bos = 1024; + + uint32_t arg_bo_hdls[max_arg_bos]; + uint32_t cmd_bo_hdl = cmd_bo->get_drm_bo_handle(); + + amdxdna_drm_exec_cmd ecmd = { + .hwctx = m_hwctx->m_handle, + .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF, + .cmd_handles = cmd_bo_hdl, + .args = reinterpret_cast(arg_bo_hdls), + .cmd_count = 1, + .arg_count = cmd_bo->get_arg_bo_handles(arg_bo_hdls, max_arg_bos), + }; + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); + + auto id = ecmd.seq; + cmd_bo->set_cmd_id(id); + shim_debug("Submitted command (%ld)", id); +} + +int poll_command(bo *cmd) { + auto cmdpkt = reinterpret_cast(cmd->map(map_type::write)); + + if (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) { + return 1; + } + return 0; +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h new file mode 100644 index 000000000..5c85f46ab --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef _HWQ_XDNA_H_ +#define _HWQ_XDNA_H_ + +#include "fence.h" +#include "hwctx.h" + +namespace shim_xdna { +struct bo; +struct hw_q { + const hw_ctx *m_hwctx; + const pdev &m_pdev; + uint32_t m_queue_boh; + + hw_q(const device &device); + ~hw_q(); + + int wait_command(bo *, uint32_t timeout_ms) const; + void submit_wait(const fence_handle *); + void submit_wait(const std::vector &); + void submit_signal(const fence_handle *); + void bind_hwctx(const hw_ctx *ctx); + void unbind_hwctx(); + void issue_command(bo *); +}; + +int poll_command(bo *); + +} // namespace shim_xdna + +#endif // _HWQ_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp new file mode 100644 index 000000000..698b1a59b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp @@ -0,0 +1,34 @@ +// +// Created by mlevental on 10/3/24. +// + +#include "shim_debug.h" + +#include +#include +#include + +static std::recursive_mutex s_debug_mutex; + +struct debug_lock { + std::lock_guard m_lk; + debug_lock(); +}; + +debug_lock::debug_lock() : m_lk(s_debug_mutex) {} + +unsigned long time_ns() { + static auto zero = std::chrono::high_resolution_clock::now(); + auto now = std::chrono::high_resolution_clock::now(); + auto integral_duration = + std::chrono::duration_cast(now - zero).count(); + return static_cast(integral_duration); +} + +void debugf(const char *format, ...) 
{ + debug_lock lk; + va_list args; + va_start(args, format); + vprintf(format, args); + va_end(args); +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h new file mode 100644 index 000000000..0e9cbd93e --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef SHIM_DEBUG_H +#define SHIM_DEBUG_H + +#include + +#include +#include +#include + +void debugf(const char *format, ...); + +namespace shim_xdna { + +template +[[noreturn]] void shim_err(int err, const char *fmt, Args &&...args) { + std::string format = std::string(fmt); + format += " (err=%d)"; + int sz = std::snprintf(nullptr, 0, format.c_str(), args..., err) + 1; + if (sz <= 0) + throw std::system_error(sz, std::system_category(), + "could not format error string"); + + auto size = static_cast(sz); + std::unique_ptr buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), args..., err); + throw std::system_error(err, std::system_category(), std::string(buf.get())); +} + +[[noreturn]] inline void shim_not_supported_err(const char *msg) { + shim_err(ENOTSUP, msg); +} + +template +void shim_debug(const char *fmt, Args &&...args) { +#ifndef NDEBUG + std::string format{"shim_xdna: "}; + format += std::string(fmt); + format += "\n"; + debugf(format.c_str(), std::forward(args)...); +#endif +} + +} // namespace shim_xdna + +#endif // SHIM_DEBUG_H From 12a9844b6edfa0fd9c036fd2ad2d0598136badca Mon Sep 17 00:00:00 2001 From: makslevental Date: Thu, 10 Oct 2024 11:14:55 -0400 Subject: [PATCH 06/35] impl allocator --- .github/workflows/ci-linux.yml | 7 + build_tools/build_test_cpp.sh | 11 +- .../driver/xrt-lite/CMakeLists.txt | 7 +- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 359 ++++++++++++++++++ 
.../iree-amd-aie/driver/xrt-lite/allocator.h | 19 + .../iree-amd-aie/driver/xrt-lite/buffer.cc | 82 ++++ .../src/iree-amd-aie/driver/xrt-lite/buffer.h | 23 ++ .../iree-amd-aie/driver/xrt-lite/device.cc | 31 +- .../iree-amd-aie/driver/xrt-lite/driver.cc | 1 - .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 39 +- .../driver/xrt-lite/shim/linux/kmq/bo.h | 59 +-- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 20 +- .../driver/xrt-lite/shim/linux/kmq/device.h | 12 +- .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 9 + .../driver/xrt-lite/shim/linux/kmq/fence.h | 13 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 29 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 6 +- .../driver/xrt-lite/shim/linux/kmq/xrt_mem.h | 147 +++++++ .../src/iree-amd-aie/driver/xrt-lite/util.h | 13 + 19 files changed, 751 insertions(+), 136 deletions(-) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h create mode 100755 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index cc647669b..a98608d9b 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -212,3 +212,10 @@ jobs: --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ --peano-install-dir=$PWD/llvm-aie + + - name: XRT-LITE tests + run: | + DEVICE_TEST_DIR="$PWD/iree-install/device_tests" + for t in $(ls $DEVICE_TEST_DIR); do + $DEVICE_TEST_DIR/$t + done diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index 857c81f44..e4a0a661e 100644 --- a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -149,8 +149,11 @@ elif [[ "$OSTYPE" == "darwin"* ]]; then fi if [ -d "$llvm_install_dir" ]; then - cp "$llvm_install_dir"/bin/lld "$install_dir"/bin - cp 
"$llvm_install_dir"/bin/FileCheck "$install_dir"/bin - cp "$llvm_install_dir"/bin/not "$install_dir"/bin + cp "$llvm_install_dir/bin/lld" "$install_dir/bin" + cp "$llvm_install_dir/bin/FileCheck" "$install_dir/bin" + cp "$llvm_install_dir/bin/not" "$install_dir/bin" fi -cp "$build_dir"/tools/testing/e2e/iree-e2e-matmul-test "$install_dir"/bin + +cp "$build_dir/tools/testing/e2e/iree-e2e-matmul-test" "$install_dir/bin" +mkdir -p "$install_dir/device_tests" +cp "$build_dir"/runtime/plugins/AMD-AIE/iree-amd-aie/driver/xrt-lite/cts/*test "$install_dir/device_tests" diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt index 0fdb39b87..6916d3cc9 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -21,6 +21,10 @@ iree_cc_library( xrt-lite SRCS api.h + allocator.cc + allocator.h + buffer.cc + buffer.h device.cc driver.cc util.h @@ -30,8 +34,5 @@ iree_cc_library( iree::base::internal::flatcc::parsing iree-amd-aie::schemas::xrt_executable_def_c_fbs iree-amd-aie::driver::xrt-lite::shim::linux::kmq::shim-xdna - COPTS - $<$:-fexceptions -frtti> - $<$:/EHsc /GR> PUBLIC ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc new file mode 100644 index 000000000..78478375b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -0,0 +1,359 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/driver/xrt-lite/allocator.h"
+
+#include "iree-amd-aie/driver/xrt-lite/buffer.h"
+#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h"
+#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h"
+#include "iree-amd-aie/driver/xrt-lite/util.h"
+
+// TODO(null): use one ID per address space or pool - each shows as a different
+// track in tracing tools.
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+static const char* IREE_HAL_XRT_LITE_ALLOCATOR_ID = "XRT-LITE unpooled";
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+namespace {
+extern const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable;
+}
+
+struct iree_hal_xrt_lite_allocator {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  std::shared_ptr shim_device;
+  IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
+
+  iree_hal_xrt_lite_allocator(iree_allocator_t host_allocator,
+                              std::shared_ptr shim_device)
+      : host_allocator(host_allocator), shim_device(shim_device) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+    iree_hal_resource_initialize(&iree_hal_xrt_lite_allocator_vtable,
+                                 &this->resource);
+    // TODO(null): query device heaps, supported features (concurrent
+    // access/etc), and prepare any pools that will be used during allocation.
+    // It's expected that most failures that occur after creation are allocation
+    // request-specific so preparing here will help keep the errors more
+    // localized.
+    IREE_TRACE_ZONE_END(z0);
+  }
+
+  ~iree_hal_xrt_lite_allocator() = default;
+
+  iree_status_t trim() {
+    // TODO(null): if the allocator is retaining any unused resources they
+    // should be dropped here. If the underlying implementation has pools or
+    // caches it should be notified that a trim is requested. This is called in
+    // low-memory situations or when IREE is not going to be used for awhile
+    // (low power modes or suspension).
+    (void)this;
+
+    return iree_ok_status();
+  }
+
+  void query_statistics(iree_hal_allocator_statistics_t* out_statistics) {
+    IREE_STATISTICS({
+      memcpy(out_statistics, &this->statistics, sizeof(*out_statistics));
+      // TODO(null): update statistics (merge).
+    });
+  }
+
+  iree_status_t query_memory_heaps(iree_host_size_t capacity,
+                                   iree_hal_allocator_memory_heap_t* heaps,
+                                   iree_host_size_t* out_count) {
+    // TODO(null): return heap information. This is called at least once with a
+    // capacity that may be 0 (indicating a query for the total count) and the
+    // heaps should only be populated if capacity is sufficient to store all of
+    // them.
+    (void)this;
+    iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                                            "heap query not implemented");
+    return status;
+  }
+
+  iree_hal_buffer_compatibility_t query_buffer_compatibility(
+      iree_hal_buffer_params_t* params, iree_device_size_t* allocation_size) {
+    // TODO(null): set compatibility rules based on the implementation.
+    // Note that the user may have requested that the allocator place the
+    // allocation based on whatever is optimal for the indicated usage by
+    // including the IREE_HAL_MEMORY_TYPE_OPTIMAL flag. It's still required that
+    // the implementation meet all the requirements but it is free to place it
+    // in either host or device memory so long as the appropriate bits are
+    // updated to indicate where it landed.
+    (void)this;
+
+    // All buffers can be allocated on the heap.
+    iree_hal_buffer_compatibility_t compatibility =
+        IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE;
+
+    if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+    }
+
+    // Buffers can only be used on the queue if they are device visible.
+    if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+      if (iree_any_bit_set(params->usage,
+                           IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
+        compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+      }
+    }
+
+    // We are now optimal.
+    params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL;
+
+    // Guard against the corner case where the requested buffer size is 0. The
+    // application is unlikely to do anything when requesting a 0-byte buffer;
+    // but it can happen in real world use cases. So we should at least not
+    // crash.
+    if (*allocation_size == 0) *allocation_size = 4;
+    // Align allocation sizes to 4 bytes so shaders operating on 32 bit types
+    // can act safely even on buffer ranges that are not naturally aligned.
+    *allocation_size = iree_host_align(*allocation_size, 4);
+
+    return compatibility;
+  }
+
+  iree_status_t allocate_buffer(const iree_hal_buffer_params_t* params,
+                                iree_device_size_t allocation_size,
+                                iree_hal_buffer_t** out_buffer) {
+    // Coerce options into those required by the current device.
+    iree_hal_buffer_params_t compat_params = *params;
+    iree_hal_buffer_compatibility_t compatibility =
+        this->query_buffer_compatibility(&compat_params, &allocation_size);
+    if (!iree_all_bits_set(compatibility,
+                           IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) {
+      // TODO(benvanik): make a helper for this.
+#if IREE_STATUS_MODE
+      iree_bitfield_string_temp_t temp0, temp1, temp2;
+      iree_string_view_t memory_type_str =
+          iree_hal_memory_type_format(params->type, &temp0);
+      iree_string_view_t usage_str =
+          iree_hal_buffer_usage_format(params->usage, &temp1);
+      iree_string_view_t compatibility_str =
+          iree_hal_buffer_compatibility_format(compatibility, &temp2);
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "allocator cannot allocate a buffer with the given parameters; "
+          "memory_type=%.*s, usage=%.*s, compatibility=%.*s",
+          (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size,
+          usage_str.data, (int)compatibility_str.size, compatibility_str.data);
+#else
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "allocator cannot allocate a buffer with the given parameters");
+#endif  // IREE_STATUS_MODE
+    }
+
+    // TODO(null): allocate the underlying device memory.
+    // The impl_ptr is just used for accounting and can be an opaque value
+    // (handle/etc) so long as it is consistent between the alloc and free and
+    // unique to the buffer while it is live. An example
+    // iree_hal_xrt_lite_buffer_wrap is provided that can be used for
+    // implementations that are managing memory using underlying allocators and
+    // just wrapping those device pointers in the HAL buffer type. Other
+    // implementations that require more tracking can provide their own buffer
+    // types that do such tracking for them.
+    (void)this;
+
+    iree_hal_buffer_t* buffer = nullptr;
+    shim_xcl_bo_flags f = {};
+    f.flags = XCL_BO_FLAGS_HOST_ONLY;
+    f.extension = 0;
+    std::unique_ptr bo =
+        shim_device->alloc_bo(allocation_size, f);
+    iree_status_t status = iree_hal_xrt_lite_buffer_wrap(
+        std::move(bo), reinterpret_cast(this),
+        compat_params.type, compat_params.access, compat_params.usage,
+        allocation_size,
+        /*byte_offset=*/0, /*byte_length=*/allocation_size,
+        iree_hal_buffer_release_callback_null(), this->host_allocator, &buffer);
+
+    if (iree_status_is_ok(status)) {
+      // TODO(null): ensure this accounting is balanced in deallocate_buffer.
+      // IREE_TRACE_ALLOC_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr,
+      // allocation_size);
+      IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc(
+          &this->statistics, compat_params.type, allocation_size));
+      *out_buffer = buffer;
+    } else {
+      iree_hal_buffer_release(buffer);
+    }
+    return status;
+  }
+
+  void deallocate_buffer(iree_hal_buffer_t* base_buffer) {
+    // TODO(null): free the underlying device memory here. Buffers allocated
+    // from this allocator will call this method to handle cleanup. Note that
+    // because this method is responsible for doing the base
+    // iree_hal_buffer_destroy and the caller assumes the memory has been freed
+    // an implementation could pool the buffer handle and return it in the
+    // future.
+
+    // TODO(null): if the buffer was imported then this accounting may need to
+    // be conditional depending on the implementation.
+    bool was_imported = false;
+    if (!was_imported) {
+      // TODO(max):
+      // IREE_TRACE_FREE_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr);
+      IREE_STATISTICS(iree_hal_allocator_statistics_record_free(
+          &this->statistics, iree_hal_buffer_memory_type(base_buffer),
+          iree_hal_buffer_allocation_size(base_buffer)));
+    }
+
+    iree_hal_buffer_destroy(base_buffer);
+  }
+
+  iree_status_t import_buffer(
+      const iree_hal_buffer_params_t* params,
+      iree_hal_external_buffer_t* external_buffer,
+      iree_hal_buffer_release_callback_t release_callback,
+      iree_hal_buffer_t** out_buffer) {
+    // Coerce options into those required by the current device.
+    iree_hal_buffer_params_t compat_params = *params;
+    iree_device_size_t allocation_size = external_buffer->size;
+    iree_hal_buffer_compatibility_t compatibility =
+        this->query_buffer_compatibility(&compat_params, &allocation_size);
+    if (!iree_all_bits_set(compatibility,
+                           IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) {
+      // TODO(benvanik): make a helper for this.
+#if IREE_STATUS_MODE
+      iree_bitfield_string_temp_t temp0, temp1, temp2;
+      iree_string_view_t memory_type_str =
+          iree_hal_memory_type_format(params->type, &temp0);
+      iree_string_view_t usage_str =
+          iree_hal_buffer_usage_format(params->usage, &temp1);
+      iree_string_view_t compatibility_str =
+          iree_hal_buffer_compatibility_format(compatibility, &temp2);
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "allocator cannot import a buffer with the given parameters; "
+          "memory_type=%.*s, usage=%.*s, compatibility=%.*s",
+          (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size,
+          usage_str.data, (int)compatibility_str.size, compatibility_str.data);
+#else
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "allocator cannot import a buffer with the given parameters");
+#endif  // IREE_STATUS_MODE
+    }
+
+    // TODO(null): switch on external_buffer->type and import the buffer. See
+    // the headers for more information on semantics. Most implementations can
+    // service IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION by just wrapping
+    // the underlying device pointer. Those that can service
+    // IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION may be able to avoid a lot
+    // of additional copies when moving data around between host and device or
+    // across devices from different drivers.
+    (void)this;
+    iree_status_t status = iree_make_status(
+        IREE_STATUS_UNIMPLEMENTED, "external buffer type not supported");
+
+    return status;
+  }
+
+  iree_status_t export_buffer(iree_hal_buffer_t* buffer,
+                              iree_hal_external_buffer_type_t requested_type,
+                              iree_hal_external_buffer_flags_t requested_flags,
+                              iree_hal_external_buffer_t* out_external_buffer) {
+    // TODO(null): switch on requested_type and export as appropriate. Most
+    // implementations can service
+    // IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION by just exposing the
+    // underlying device pointer. Those that can service
+    // IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION may be able to avoid a lot
+    // of additional copies when moving data around between host and device or
+    // across devices from different drivers.
+    (void)this;
+    return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                            "external buffer type not supported");
+  }
+};
+
+static iree_hal_xrt_lite_allocator* iree_hal_xrt_lite_allocator_cast(
+    iree_hal_allocator_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_allocator_vtable);
+  return reinterpret_cast(base_value);
+}
+
+iree_status_t iree_hal_xrt_lite_allocator_create(
+    iree_allocator_t host_allocator, std::shared_ptr device,
+    iree_hal_allocator_t** out_allocator) {
+  IREE_ASSERT_ARGUMENT(out_allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_xrt_lite_allocator* allocator = nullptr;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*allocator),
+                                reinterpret_cast(&allocator)));
+  allocator =
+      new (allocator) iree_hal_xrt_lite_allocator(host_allocator, device);
+  iree_status_t status = iree_ok_status();
+
+  if (iree_status_is_ok(status)) {
+    *out_allocator = reinterpret_cast(allocator);
+  } else {
+    iree_hal_allocator_release(
+        reinterpret_cast(allocator));
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_xrt_lite_allocator_destroy(
+    iree_hal_allocator_t* base_allocator) {
+  IREE_ASSERT_ARGUMENT(base_allocator);
+  iree_hal_xrt_lite_allocator* allocator =
+      iree_hal_xrt_lite_allocator_cast(base_allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Built with placement new in host-allocator storage, so the destructor is
+  // run by hand (it drops the shim_device reference). The resource must not be
+  // released here: destroy only runs once its reference count reached zero.
+  iree_allocator_t host_allocator = allocator->host_allocator;
+  allocator->~iree_hal_xrt_lite_allocator();
+  iree_allocator_free(host_allocator, allocator);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator(
+    const iree_hal_allocator_t* base_allocator) {
+  const iree_hal_xrt_lite_allocator* allocator =
+      reinterpret_cast(base_allocator);
+  return allocator->host_allocator;
+}
+
+MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, trim)
+MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator,
+                    query_statistics)
+MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator,
+                      query_memory_heaps)
+MEMBER_WRAPPER(iree_hal_allocator_t, iree_hal_xrt_lite_allocator,
+               query_buffer_compatibility, iree_hal_buffer_compatibility_t)
+MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator,
+                      allocate_buffer)
+MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator,
+                    deallocate_buffer)
+MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator,
+                      import_buffer)
+MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator,
+                      export_buffer)
+
+namespace {
+const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable = {
+    .destroy = iree_hal_xrt_lite_allocator_destroy,
+    .host_allocator = iree_hal_xrt_lite_allocator_host_allocator,
+    .trim = iree_hal_xrt_lite_allocator_trim,
+    .query_statistics = iree_hal_xrt_lite_allocator_query_statistics,
+    .query_memory_heaps = iree_hal_xrt_lite_allocator_query_memory_heaps,
+    .query_buffer_compatibility =
+        iree_hal_xrt_lite_allocator_query_buffer_compatibility,
+    .allocate_buffer = iree_hal_xrt_lite_allocator_allocate_buffer,
+    .deallocate_buffer = iree_hal_xrt_lite_allocator_deallocate_buffer,
+    .import_buffer = iree_hal_xrt_lite_allocator_import_buffer,
+    .export_buffer = iree_hal_xrt_lite_allocator_export_buffer,
+};
+
+}
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h
new file mode 100644
index
000000000..630bcdab3
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h
@@ -0,0 +1,19 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_XRT_LITE_ALLOCATOR_H_
+#define IREE_HAL_DRIVERS_XRT_LITE_ALLOCATOR_H_
+
+#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+// Creates a buffer allocator used for persistent allocations.
+iree_status_t iree_hal_xrt_lite_allocator_create(
+    iree_allocator_t host_allocator, std::shared_ptr device,
+    iree_hal_allocator_t** out_allocator);
+
+#endif  // IREE_HAL_DRIVERS_XRT_LITE_ALLOCATOR_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc
new file mode 100644
index 000000000..780e3d64a
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc
@@ -0,0 +1,82 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/driver/xrt-lite/buffer.h"
+
+#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h"
+#include "iree-amd-aie/driver/xrt-lite/util.h"
+
+struct iree_hal_xrt_lite_buffer_t {
+  iree_hal_buffer_t base;
+  std::unique_ptr bo;
+  iree_hal_buffer_release_callback_t release_callback;
+};
+
+namespace {
+extern const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable;
+}
+
+iree_status_t iree_hal_xrt_lite_buffer_wrap(
+    std::unique_ptr bo, iree_hal_allocator_t* allocator,
+    iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) {
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  *out_buffer = nullptr;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_xrt_lite_buffer_t* buffer = nullptr;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer));
+  iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base,
+                             allocation_size, byte_offset, byte_length,
+                             memory_type, allowed_access, allowed_usage,
+                             &iree_hal_xrt_lite_buffer_vtable, &buffer->base);
+  buffer->release_callback = release_callback;
+  // The struct lives in raw host-allocator storage, so no constructor has run
+  // for the `bo` member yet. Begin its lifetime with placement new rather than
+  // move-assigning into an object that was never constructed; this takes
+  // ownership of the shim buffer object from the caller.
+  new (&buffer->bo) decltype(buffer->bo)(std::move(bo));
+  *out_buffer = &buffer->base;
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) {
+  iree_hal_xrt_lite_buffer_t* buffer =
+      reinterpret_cast(base_buffer);
+  iree_allocator_t host_allocator = base_buffer->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Optionally call a release callback when the buffer is destroyed. Not all
+  // implementations may require this but it's cheap and provides additional
+  // flexibility.
+  if (buffer->release_callback.fn) {
+    buffer->release_callback.fn(buffer->release_callback.user_data,
+                                base_buffer);
+  }
+
+  buffer->bo.reset();
+  iree_allocator_free(host_allocator, buffer);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+namespace {
+const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable = {
+    .recycle = iree_hal_buffer_recycle,
+    .destroy = iree_hal_xrt_lite_buffer_destroy,
+    .map_range = unimplemented,
+    .unmap_range = unimplemented,
+    .invalidate_range = unimplemented,
+    .flush_range = unimplemented,
+};
+}
\ No newline at end of file
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h
new file mode 100644
index 000000000..31849a30d
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h
@@ -0,0 +1,23 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_
+#define IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_
+
+#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h"
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+// Wraps an allocation in an iree_hal_buffer_t.
+iree_status_t iree_hal_xrt_lite_buffer_wrap(
+    std::unique_ptr bo, iree_hal_allocator_t* allocator,
+    iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer);
+
+#endif  // IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index ec4366945..9b3f77d75 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -6,6 +6,7 @@ #include "iree-amd-aie/driver/xrt-lite/device.h" +#include "iree-amd-aie/driver/xrt-lite/allocator.h" #include "iree-amd-aie/driver/xrt-lite/api.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" @@ -13,6 +14,8 @@ struct iree_hal_xrt_lite_device_t { iree_hal_resource_t resource; iree_string_view_t identifier; iree_allocator_t host_allocator; + // not used + iree_hal_allocator_t* device_allocator; std::shared_ptr shim_device; }; @@ -41,32 +44,27 @@ iree_status_t iree_hal_xrt_lite_device_create( iree_hal_xrt_lite_device_t* device = nullptr; iree_host_size_t total_size = sizeof(*device) + identifier.size; IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_allocator_malloc(host_allocator, total_size, (void**)&device)); + z0, iree_allocator_malloc(host_allocator, total_size, + reinterpret_cast(&device))); iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &device->resource); iree_string_view_append_to_buffer( identifier, &device->identifier, reinterpret_cast(device) + total_size - identifier.size); device->host_allocator = host_allocator; + device->shim_device = std::make_shared(); // TODO(null): pass device handles and pool configuration to the allocator.
// Some implementations may share allocators across multiple devices created // from the same driver. - // TODO(max): - // iree_status_t status = iree_hal_xrt_lite_allocator_create( - // host_allocator, &device->device_allocator); - // TOOD(max): device id - - device->shim_device = std::make_shared(); - - iree_status_t status = iree_ok_status(); - + iree_status_t status = iree_hal_xrt_lite_allocator_create( + host_allocator, device->shim_device, &device->device_allocator); + // TODO(max): device id if (iree_status_is_ok(status)) { *out_device = reinterpret_cast(device); } else { iree_hal_device_release(reinterpret_cast(device)); } - IREE_TRACE_ZONE_END(z0); return status; } @@ -96,6 +94,7 @@ static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { // implementation performs internal async operations those should be shutdown // and joined first. + iree_hal_allocator_release(device->device_allocator); device->shim_device.reset(); iree_allocator_free(host_allocator, device); @@ -109,10 +108,18 @@ static iree_allocator_t iree_hal_xrt_lite_device_host_allocator( return device->host_allocator; } +static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( + iree_hal_device_t* base_device) { + iree_hal_xrt_lite_device_t* device = + iree_hal_xrt_lite_device_cast(base_device); + return device->device_allocator; +} + namespace { const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { .destroy = iree_hal_xrt_lite_device_destroy, .id = iree_hal_xrt_lite_device_id, .host_allocator = iree_hal_xrt_lite_device_host_allocator, + .device_allocator = iree_hal_xrt_lite_device_device_allocator, }; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 3143e6208..6dc67be78 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -5,7 +5,6 @@ //
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/driver/xrt-lite/api.h" -#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" #include "util.h" typedef struct iree_hal_xrt_lite_driver_t { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index 7756865a1..6adf21546 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -9,14 +9,15 @@ #include #include "shim_debug.h" +#include "xrt_mem.h" namespace { uint32_t alloc_drm_bo(const shim_xdna::pdev &dev, amdxdna_bo_type type, - void *buf, size_t size) { + size_t size) { amdxdna_drm_create_bo cbo = { .type = static_cast(type), - .vaddr = reinterpret_cast(buf), + .vaddr = reinterpret_cast(nullptr), .size = size, }; dev.ioctl(DRM_IOCTL_AMDXDNA_CREATE_BO, &cbo); @@ -105,9 +106,8 @@ void *addr_align(void *p, size_t align) { return reinterpret_cast((uintptr_t)p + align & ~(align - 1)); } -amdxdna_bo_type flag_to_type(uint64_t bo_flags) { - auto flags = xcl_bo_flags{bo_flags}; - auto boflags = (static_cast(flags.boflags) << 24); +amdxdna_bo_type flag_to_type(shim_xcl_bo_flags flags) { + uint32_t boflags = (static_cast(flags.boflags) << 24); switch (boflags) { case XCL_BO_FLAGS_NONE: case XCL_BO_FLAGS_HOST_ONLY: @@ -192,7 +192,7 @@ std::string bo::type_to_name() const { case AMDXDNA_BO_DEV_HEAP: return {"AMDXDNA_BO_DEV_HEAP"}; case AMDXDNA_BO_DEV: - if (xcl_bo_flags{m_flags}.use == XRT_BO_USE_DEBUG) + if (shim_xcl_bo_flags{m_flags}.use == XRT_BO_USE_DEBUG) return {"AMDXDNA_BO_DEV_DEBUG"}; return {"AMDXDNA_BO_DEV"}; case AMDXDNA_BO_CMD: @@ -251,14 +251,6 @@ void bo::munmap_bo() { if (m_parent) unmap_drm_bo(m_pdev, m_parent, m_parent_size); } -void bo::alloc_bo() { - uint32_t boh = alloc_drm_bo(m_pdev, m_type, nullptr, m_aligned_size); - - amdxdna_drm_get_bo_info bo_info = {}; - get_drm_bo_info(m_pdev, boh, 
&bo_info); - m_drm_bo = std::make_unique(*this, bo_info); -} - void bo::import_bo() { uint32_t boh = import_drm_bo(m_pdev, m_import, &m_type, &m_aligned_size); @@ -269,13 +261,13 @@ void bo::import_bo() { void bo::free_bo() { m_drm_bo.reset(); } -bo::bo(const pdev &p, uint32_t ctx_id, size_t size, uint64_t flags) +bo::bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags) : bo(p, ctx_id, size, flags, flag_to_type(flags)) { if (m_type == AMDXDNA_BO_INVALID) - shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); + shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags.all); } -bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, uint64_t flags, +bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, amdxdna_bo_type type) : m_pdev(pdev), m_aligned_size(size), @@ -288,18 +280,25 @@ bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, uint64_t flags, if (m_type == AMDXDNA_BO_DEV_HEAP) align = 64 * 1024 * 1024; // Device mem heap must align at 64MB boundary. - alloc_bo(); + uint32_t boh = alloc_drm_bo(m_pdev, m_type, m_aligned_size); + // TODO(max): this is dumb? performs an ioctl right after we just made one? + amdxdna_drm_get_bo_info bo_info = {}; + get_drm_bo_info(m_pdev, boh, &bo_info); + m_drm_bo = std::make_unique(*this, bo_info); + mmap_bo(align); // Newly allocated buffer may contain dirty pages. If used as output buffer, // the data in cacheline will be flushed onto memory and pollute the output // from device. We perform a cache flush right after the BO is allocated to // avoid this issue.
- if (m_type == AMDXDNA_BO_SHMEM) sync(direction::host2device, size, 0); + if (m_type == AMDXDNA_BO_SHMEM) { + sync(direction::host2device, size, 0); + } attach_to_ctx(); #ifndef NDEBUG - switch (m_flags) { + switch (m_flags.all) { case 0x0: shim_debug("allocating dev heap"); break; @@ -353,7 +352,7 @@ bo::~bo() { } bo::bo(const pdev &p, size_t size, amdxdna_bo_type type) - : bo(p, AMDXDNA_INVALID_CTX_HANDLE, size, 0, type) {} + : bo(p, AMDXDNA_INVALID_CTX_HANDLE, size, shim_xcl_bo_flags{}, type) {} properties bo::get_properties() const { return {m_flags, m_aligned_size, get_paddr(), get_drm_bo_handle()}; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h index 73567f262..f48849845 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -15,49 +15,6 @@ namespace shim_xdna { #define XRT_BO_USE_NORMAL 0 #define XRT_BO_USE_DEBUG 1 -/** - * XCL BO Flags bits layout - * - * bits 0 ~ 15: DDR BANK index - * bits 24 ~ 31: BO flags - */ -#define XRT_BO_FLAGS_MEMIDX_MASK (0xFFFFFFUL) -#define XCL_BO_FLAGS_NONE (0) -#define XCL_BO_FLAGS_CACHEABLE (1U << 24) -#define XCL_BO_FLAGS_KERNBUF (1U << 25) -#define XCL_BO_FLAGS_SGL (1U << 26) -#define XCL_BO_FLAGS_SVM (1U << 27) -#define XCL_BO_FLAGS_DEV_ONLY (1U << 28) -#define XCL_BO_FLAGS_HOST_ONLY (1U << 29) -#define XCL_BO_FLAGS_P2P (1U << 30) -#define XCL_BO_FLAGS_EXECBUF (1U << 31) - -/** - * Encoding of flags passed to xcl buffer allocation APIs - */ -struct xcl_bo_flags { - union { - uint64_t all; // [63-0] - - struct { - uint32_t flags; // [31-0] - uint32_t extension; // [63-32] - }; - - struct { - uint16_t bank; // [15-0] - uint8_t slot; // [23-16] - uint8_t boflags; // [31-24] - - // extension - uint32_t access : 2; // [33-32] - uint32_t dir : 2; // [35-34] - uint32_t use : 1; // [36] - uint32_t unused : 27; // [63-35] - }; - }; -}; - // map_type - 
determines how a buffer is mapped enum class map_type { read, write }; @@ -76,10 +33,10 @@ enum class direction { // properties - buffer details struct properties { - uint64_t flags; // flags of bo - uint64_t size; // size of bo - uint64_t paddr; // physical address - uint64_t kmhdl; // kernel mode handle + shim_xcl_bo_flags flags; // flags of bo + uint64_t size; // size of bo + uint64_t paddr; // physical address + uint64_t kmhdl; // kernel mode handle }; struct drm_bo { @@ -99,7 +56,7 @@ struct bo { void *m_aligned = nullptr; size_t m_parent_size = 0; size_t m_aligned_size = 0; - uint64_t m_flags = 0; + shim_xcl_bo_flags m_flags{}; amdxdna_bo_type m_type = AMDXDNA_BO_INVALID; std::unique_ptr m_drm_bo; const shared_handle m_import; @@ -115,9 +72,9 @@ struct bo { // among all HW contexts. uint32_t m_owner_ctx_id = AMDXDNA_INVALID_CTX_HANDLE; - bo(const pdev &p, uint32_t ctx_id, size_t size, uint64_t flags, + bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, amdxdna_bo_type type); - bo(const pdev &p, uint32_t ctx_id, size_t size, uint64_t flags); + bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags); bo(const pdev &p, int ehdl); // Support BO creation from internal bo(const pdev &p, size_t size, amdxdna_bo_type type); @@ -137,8 +94,6 @@ struct bo { // DRM BO managed by driver. 
void bind_at(size_t pos, const bo *bh, size_t offset, size_t size); std::string describe() const; - // Alloc DRM BO from driver - void alloc_bo(); // Import DRM BO from m_import shared object void import_bo(); // Free DRM BO in driver diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 191741170..eeaa6eedc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -17,6 +16,8 @@ #include "bo.h" #include "fence.h" #include "hwctx.h" +#include "shim_debug.h" +#include "xrt_mem.h" namespace { @@ -158,13 +159,13 @@ std::unique_ptr device::create_hw_context( return std::make_unique(*this, get_xclbin(xclbin_uuid), qos); } -std::unique_ptr device::alloc_bo(size_t size, uint64_t flags) { - return alloc_bo(nullptr, size, flags); +std::unique_ptr device::alloc_bo(uint32_t ctx_id, size_t size, + shim_xcl_bo_flags flags) { + return std::make_unique(this->m_pdev, ctx_id, size, flags); } -std::unique_ptr device::alloc_bo(void *userptr, size_t size, - uint64_t flags) { - return alloc_bo(userptr, AMDXDNA_INVALID_CTX_HANDLE, size, flags); +std::unique_ptr device::alloc_bo(size_t size, shim_xcl_bo_flags flags) { + return alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags); } std::unique_ptr device::import_bo(pid_t pid, int ehdl) { @@ -184,13 +185,6 @@ void device::record_xclbin(const xrt::xclbin &xclbin) { m_xclbin = xclbin; } -std::unique_ptr device::alloc_bo(void *userptr, uint32_t ctx_id, - size_t size, uint64_t flags) { - if (userptr) shim_not_supported_err("User ptr BO"); - - return std::make_unique(this->m_pdev, ctx_id, size, flags); -} - std::unique_ptr device::import_bo(int ehdl) const { return std::make_unique(this->m_pdev, ehdl); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h 
b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h index 4b5f224ad..bad30ba4f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ -9,7 +9,7 @@ #include "experimental/xrt_xclbin.h" #include "fence.h" -#include "shim_debug.h" +#include "xrt_mem.h" namespace shim_xdna { struct pdev; @@ -42,12 +42,11 @@ struct device { std::unique_ptr import_bo(int ehdl) const; const pdev &get_pdev() const; - std::unique_ptr alloc_bo(void *userptr, uint32_t ctx_id, size_t size, - uint64_t flags); - - std::unique_ptr alloc_bo(size_t size, uint64_t flags); - std::unique_ptr alloc_bo(void *userptr, size_t size, uint64_t flags); + std::unique_ptr alloc_bo(uint32_t ctx_id, size_t size, + shim_xcl_bo_flags flags); + std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); std::unique_ptr import_bo(pid_t, int); + std::unique_ptr create_hw_context( const xrt::uuid &xclbin_uuid, const std::map &qos); std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, @@ -57,6 +56,7 @@ struct device { uint32_t read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr); void write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr, uint32_t reg_val); + std::unique_ptr create_fence(fence_handle::access_mode); std::unique_ptr import_fence(pid_t, int); void record_xclbin(const xrt::xclbin &xclbin); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp index 2f0ea7c14..55df59e27 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp @@ -3,9 +3,12 @@ #include "fence.h" +#include + #include #include "hwctx.h" +#include "shim_debug.h" namespace { @@ -124,6 +127,12 @@ void submit_signal_syncobj(const shim_xdna::pdev &dev, namespace shim_xdna { +shared_handle::~shared_handle() { + if (m_fd != 
-1) close(m_fd); +} + +int shared_handle::get_export_handle() const { return m_fd; } + fence_handle::fence_handle(const device &device) : m_pdev(device.get_pdev()), m_import(std::make_unique(-1)), diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h index d650adb7a..842b85c2d 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h @@ -4,25 +4,20 @@ #ifndef _FENCE_XDNA_H_ #define _FENCE_XDNA_H_ +#include #include #include -#include "shim_debug.h" - namespace shim_xdna { struct pdev; struct device; struct hw_ctx; struct shared_handle { - shared_handle(int fd) : m_fd(fd) {} - - ~shared_handle() { - if (m_fd != -1) close(m_fd); - } - int get_export_handle() const { return m_fd; } - const int m_fd; + shared_handle(int fd) : m_fd(fd) {} + ~shared_handle(); + int get_export_handle() const; }; struct fence_handle { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index a81300001..e533a26d8 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -9,6 +9,7 @@ #include "bo.h" #include "core/common/api/xclbin_int.h" #include "hwq.h" +#include "shim_debug.h" namespace { @@ -90,7 +91,7 @@ cuidx_type hw_ctx::open_cu_context(const std::string &cu_name) { shim_err(ENOENT, "CU name (%s) not found", cu_name.c_str()); } -std::unique_ptr hw_ctx::alloc_bo(size_t size, uint64_t flags) { +std::unique_ptr hw_ctx::alloc_bo(size_t size, shim_xcl_bo_flags flags) { return alloc_bo(nullptr, size, flags); } @@ -134,7 +135,9 @@ void hw_ctx::delete_ctx_on_device() { void hw_ctx::init_log_buf() { auto log_buf_size = m_num_cols * 1024; - m_log_bo = alloc_bo(nullptr, log_buf_size, XCL_BO_FLAGS_EXECBUF); + shim_xcl_bo_flags f; + 
f.flags = XCL_BO_FLAGS_EXECBUF; + m_log_bo = alloc_bo(nullptr, log_buf_size, f); m_log_buf = m_log_bo->map(map_type::write); std::memset(m_log_buf, 0, log_buf_size); } @@ -153,21 +156,21 @@ hw_ctx::hw_ctx(device &device, const xrt::xclbin &xclbin, const qos_t &qos) cu_conf_param_buf.data()); cu_conf_param->num_cus = m_cu_info.size(); - xcl_bo_flags f = {}; + shim_xcl_bo_flags f = {}; f.flags = XRT_BO_FLAGS_CACHEABLE; for (int i = 0; i < m_cu_info.size(); i++) { - auto &ci = m_cu_info[i]; + cu_info &ci = m_cu_info[i]; - m_pdi_bos.push_back(alloc_bo(nullptr, ci.m_pdi.size(), f.all)); - auto &pdi_bo = m_pdi_bos[i]; - auto pdi_vaddr = reinterpret_cast(pdi_bo->map(map_type::write)); + m_pdi_bos.push_back(alloc_bo(ci.m_pdi.size(), f)); + std::unique_ptr &pdi_bo = m_pdi_bos[i]; + char *pdi_vaddr = reinterpret_cast(pdi_bo->map(map_type::write)); // see cu_configs[1] in amdxdna_hwctx_param_config_cu assert(i < 1 && "only 1 CU supported"); - auto &cf = cu_conf_param->cu_configs[i]; + amdxdna_cu_config &cf = cu_conf_param->cu_configs[i]; std::memcpy(pdi_vaddr, ci.m_pdi.data(), ci.m_pdi.size()); pdi_bo->sync(direction::host2device, pdi_bo->get_properties().size, 0); - cf.cu_bo = pdi_bo.get()->get_drm_bo_handle(); + cf.cu_bo = pdi_bo->get_drm_bo_handle(); cf.cu_func = ci.m_func; } @@ -182,13 +185,13 @@ hw_ctx::hw_ctx(device &device, const xrt::xclbin &xclbin, const qos_t &qos) } std::unique_ptr hw_ctx::alloc_bo(void *userptr, size_t size, - uint64_t flags) { + shim_xcl_bo_flags flags) { // const_cast: alloc_bo() is not const yet in device class // Debug buffer is specific to one context. - if (xcl_bo_flags{flags}.use == XRT_BO_USE_DEBUG) - return m_device.alloc_bo(userptr, m_handle, size, flags); + if (flags.use == XRT_BO_USE_DEBUG) + return m_device.alloc_bo(m_handle, size, flags); // Other BOs are shared across all contexts. 
- return m_device.alloc_bo(userptr, AMDXDNA_INVALID_CTX_HANDLE, size, flags); + return m_device.alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags); } std::unique_ptr create_hw_context(device &dev, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h index 1f2a62277..0d25824f1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -8,7 +8,6 @@ #include "amdxdna_accel.h" #include "device.h" -#include "shim_debug.h" namespace shim_xdna { @@ -57,8 +56,9 @@ struct hw_ctx { ~hw_ctx(); // TODO - std::unique_ptr alloc_bo(void *userptr, size_t size, uint64_t flags); - std::unique_ptr alloc_bo(size_t size, uint64_t flags); + std::unique_ptr alloc_bo(void *userptr, size_t size, + shim_xcl_bo_flags flags); + std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); std::unique_ptr import_bo(pid_t, int); cuidx_type open_cu_context(const std::string &cuname); void create_ctx_on_device(); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h new file mode 100755 index 000000000..d7286bcd0 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2019-2022, Xilinx Inc - All rights reserved. + * Xilinx Runtime (XRT) APIs + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may + * not use this file except in compliance with the License. A copy of the + * License is located at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations + * under the License. + * + * GPL license Verbiage: + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. This program is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + * Public License for more details. You should have received a copy of the GNU + * General Public License along with this program; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + */ + +#ifndef _SHIM_MEM_H_ +#define _SHIM_MEM_H_ + +#ifdef _WIN32 +#pragma warning(push) +#pragma warning(disable : 4201) +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +#ifdef __cplusplus +#include +extern "C" { +#else +#if defined(__KERNEL__) +#include +#else +#include +#endif +#endif + +/** + * Encoding of flags passed to xcl buffer allocation APIs + */ +struct shim_xcl_bo_flags { + union { + uint64_t all; // [63-0] + + struct { + uint32_t flags; // [31-0] + uint32_t extension; // [63-32] + }; + + struct { + uint16_t bank; // [15-0] + uint8_t slot; // [23-16] + uint8_t boflags; // [31-24] + + // extension + uint32_t access : 2; // [33-32] + uint32_t dir : 2; // [35-34] + uint32_t use : 1; // [36] + uint32_t unused : 27; // [63-35] + }; + }; +}; + +/** + * XCL BO Flags bits layout + * + * bits 0 ~ 15: DDR BANK index + * bits 24 ~ 31: BO flags + */ +#define XRT_BO_FLAGS_MEMIDX_MASK (0xFFFFFFUL) +#define XCL_BO_FLAGS_NONE (0) +#define XCL_BO_FLAGS_CACHEABLE (1U << 24) +#define XCL_BO_FLAGS_KERNBUF (1U << 25) +#define XCL_BO_FLAGS_SGL (1U << 26) +#define 
XCL_BO_FLAGS_SVM (1U << 27) +#define XCL_BO_FLAGS_DEV_ONLY (1U << 28) +#define XCL_BO_FLAGS_HOST_ONLY (1U << 29) +#define XCL_BO_FLAGS_P2P (1U << 30) +#define XCL_BO_FLAGS_EXECBUF (1U << 31) + +/** + * Shim level BO Flags for extension + */ +#define XRT_BO_ACCESS_LOCAL 0 +#define XRT_BO_ACCESS_SHARED 1 +#define XRT_BO_ACCESS_PROCESS 2 +#define XRT_BO_ACCESS_HYBRID 3 + +/** + * Shim level BO Flags for direction of data transfer + * as seen from device. + */ +#define XRT_BO_ACCESS_READ (1U << 0) +#define XRT_BO_ACCESS_WRITE (1U << 1) +#define XRT_BO_ACCESS_READ_WRITE (XRT_BO_ACCESS_READ | XRT_BO_ACCESS_WRITE) + +/** + * Shim level BO Flags to distinguish use of BO + * + * The use flag is for internal use only. A debug BO + * is supported only on some platforms to communicate + * data from driver / firmware back to user space. + */ +#define XRT_BO_USE_NORMAL 0 +#define XRT_BO_USE_DEBUG 1 + +/** + * XRT Native BO flags + * + * These flags are simple aliases for use with XRT native BO APIs. + */ +#define XRT_BO_FLAGS_NONE XCL_BO_FLAGS_NONE +#define XRT_BO_FLAGS_CACHEABLE XCL_BO_FLAGS_CACHEABLE +#define XRT_BO_FLAGS_DEV_ONLY XCL_BO_FLAGS_DEV_ONLY +#define XRT_BO_FLAGS_HOST_ONLY XCL_BO_FLAGS_HOST_ONLY +#define XRT_BO_FLAGS_P2P XCL_BO_FLAGS_P2P +#define XRT_BO_FLAGS_SVM XCL_BO_FLAGS_SVM + +#ifdef __cplusplus +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#ifdef _WIN32 +#pragma warning(pop) +#endif + +#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h index 70b4c88c0..92556fcd9 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h @@ -19,4 +19,17 @@ void unimplemented(Params...) { IREE_ASSERT(false && "unimplemented"); } +#define MEMBER_WRAPPER(From, To, member, return_t) \ + template \ + static return_t To##_##member(From* b, Args... 
args) { \ + auto* obj = reinterpret_cast(b); \ + return obj->member(args...); \ + } + +#define MEMBER_WRAPPER_STATUS(From, To, member) \ + MEMBER_WRAPPER(From, To, member, iree_status_t) + +#define MEMBER_WRAPPER_VOID(From, To, member) \ + MEMBER_WRAPPER(From, To, member, void) + #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H From b953f914e32ca1339eac0400fcee65c385fc4b0e Mon Sep 17 00:00:00 2001 From: makslevental Date: Thu, 10 Oct 2024 17:02:29 -0400 Subject: [PATCH 07/35] buffer impl --- .github/workflows/ci-linux.yml | 1 + .../iree-amd-aie/driver/xrt-lite/allocator.cc | 32 +++--- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 105 ++++++++++++++++-- .../driver/xrt-lite/cts/CMakeLists.txt | 2 +- 4 files changed, 116 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index a98608d9b..7f1cd1f56 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -215,6 +215,7 @@ jobs: - name: XRT-LITE tests run: | + source /opt/xilinx/xrt/setup.sh DEVICE_TEST_DIR="$PWD/iree-install/device_tests" for t in $(ls $DEVICE_TEST_DIR); do $DEVICE_TEST_DIR/$t diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 78478375b..7c41c5f84 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -325,21 +325,23 @@ static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator( return allocator->host_allocator; } -MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, trim) -MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, - query_statistics) -MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, - query_memory_heaps) -MEMBER_WRAPPER(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, - query_buffer_compatibility, iree_hal_buffer_compatibility_t) -MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, 
iree_hal_xrt_lite_allocator, - allocate_buffer) -MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, - deallocate_buffer) -MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, - import_buffer) -MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, - export_buffer) +#define ALLOCATOR_MEMBER(member, return_t) \ + MEMBER_WRAPPER(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, member, \ + return_t) +#define ALLOCATOR_MEMBER_STATUS(member) \ + MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, \ + member) +#define ALLOCATOR_MEMBER_VOID(member) \ + MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, member) + +ALLOCATOR_MEMBER_STATUS(trim) +ALLOCATOR_MEMBER_VOID(query_statistics) +ALLOCATOR_MEMBER_STATUS(query_memory_heaps) +ALLOCATOR_MEMBER(query_buffer_compatibility, iree_hal_buffer_compatibility_t) +ALLOCATOR_MEMBER_STATUS(allocate_buffer) +ALLOCATOR_MEMBER_VOID(deallocate_buffer) +ALLOCATOR_MEMBER_STATUS(import_buffer) +ALLOCATOR_MEMBER_STATUS(export_buffer) namespace { const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable = { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index 780e3d64a..a41e12abf 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -9,10 +9,91 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" #include "iree-amd-aie/driver/xrt-lite/util.h" -struct iree_hal_xrt_lite_buffer_t { +struct iree_hal_xrt_lite_buffer { iree_hal_buffer_t base; std::unique_ptr bo; iree_hal_buffer_release_callback_t release_callback; + + iree_status_t map_range(iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + 
IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type( + reinterpret_cast(this)), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( + iree_hal_buffer_allowed_usage( + reinterpret_cast(this)), + mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT + ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT + : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + + // TODO(null): perform mapping as described. Note that local-to-buffer range + // adjustment may be required. The resulting mapping is populated with + // standard information such as contents indicating the host addressable + // memory range of the mapped buffer and implementation-specific information + // if additional resources are required. iree_hal_buffer_emulated_map_range + // can be used by implementations that have no way of providing host + // pointers at a large cost (alloc + device->host transfer on map and + // host->device transfer + dealloc on umap). Try not to use that. + void* host_ptr = this->bo->map(shim_xdna::map_type::write); + IREE_ASSERT(host_ptr != + nullptr); // Should be guaranteed by previous checks. + uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; + iree_status_t status = + this->invalidate_range(local_byte_offset, local_byte_length); + // If we mapped for discard scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for heap + // buffers we could reallocate them such that ASAN yells, but that would + // only work if the entire buffer was discarded. 
+#ifndef NDEBUG + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); + } +#endif // !NDEBUG + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return status; + } + + iree_status_t unmap_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + // TODO(null): reverse of map_range. Note that cache invalidation is + // explicit via invalidate_range and need not be performed here. If using + // emulated mapping this must call iree_hal_buffer_emulated_unmap_range to + // release the transient resources. + return this->flush_range(local_byte_offset, local_byte_length); + } + + iree_status_t invalidate_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // TODO(null): invalidate the range if required by the buffer. Writes on the + // device are expected to be visible to the host after this returns. + if (IREE_UNLIKELY(!this->bo)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); + } + this->bo->sync(shim_xdna::direction::device2host, local_byte_length, + local_byte_offset); + return iree_ok_status(); + } + + iree_status_t flush_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // TODO(null): flush the range if required by the buffer. Writes on the + // host are expected to be visible to the device after this returns. 
+ if (IREE_UNLIKELY(!this->bo)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); + } + this->bo->sync(shim_xdna::direction::host2device, local_byte_length, + local_byte_offset); + return iree_ok_status(); + } }; namespace { @@ -30,7 +111,7 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( *out_buffer = nullptr; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_buffer_t* buffer = nullptr; + iree_hal_xrt_lite_buffer* buffer = nullptr; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer)); @@ -51,8 +132,8 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( } static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { - iree_hal_xrt_lite_buffer_t* buffer = - reinterpret_cast(base_buffer); + iree_hal_xrt_lite_buffer* buffer = + reinterpret_cast(base_buffer); iree_allocator_t host_allocator = base_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); @@ -70,13 +151,21 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { IREE_TRACE_ZONE_END(z0); } +#define BUFFER_MEMBER_STATUS(member) \ + MEMBER_WRAPPER_STATUS(iree_hal_buffer_t, iree_hal_xrt_lite_buffer, member) + +BUFFER_MEMBER_STATUS(map_range) +BUFFER_MEMBER_STATUS(unmap_range) +BUFFER_MEMBER_STATUS(invalidate_range) +BUFFER_MEMBER_STATUS(flush_range) + namespace { const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable = { .recycle = iree_hal_buffer_recycle, .destroy = iree_hal_xrt_lite_buffer_destroy, - .map_range = unimplemented, - .unmap_range = unimplemented, - .invalidate_range = unimplemented, - .flush_range = unimplemented, + .map_range = iree_hal_xrt_lite_buffer_map_range, + .unmap_range = iree_hal_xrt_lite_buffer_unmap_range, + .invalidate_range = iree_hal_xrt_lite_buffer_invalidate_range, + .flush_range = iree_hal_xrt_lite_buffer_flush_range, }; } \ No newline at end of file diff --git 
a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt index e18221bda..4bd029265 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -21,7 +21,7 @@ iree_hal_cts_test_suite( iree-amd-aie::driver::xrt-lite::registration INCLUDED_TESTS "allocator" -# "buffer_mapping" + "buffer_mapping" # "command_buffer" "driver" ) From d1072ee361459dd947bb04a42b267527c3f62491 Mon Sep 17 00:00:00 2001 From: makslevental Date: Thu, 10 Oct 2024 19:05:00 -0400 Subject: [PATCH 08/35] executable cache impl --- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp | 6 +- iree_runtime_plugin.cmake | 10 +- .../driver/xrt-lite/CMakeLists.txt | 8 +- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 23 +- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 10 +- .../driver/xrt-lite/command_buffer.cc | 334 ++++++++++++++++++ .../driver/xrt-lite/command_buffer.h | 25 ++ .../driver/xrt-lite/cts/CMakeLists.txt | 148 ++++---- ...est.cc => command_buffer_dispatch_test.cc} | 45 ++- .../xrt-lite/cts/executable_cache_test.cc | 85 +++++ .../iree-amd-aie/driver/xrt-lite/device.cc | 56 ++- .../driver/xrt-lite/executable.cc | 273 ++++++++++++++ .../iree-amd-aie/driver/xrt-lite/executable.h | 51 +++ .../driver/xrt-lite/nop_executable_cache.cc | 98 +++++ .../driver/xrt-lite/nop_executable_cache.h | 33 ++ .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 125 ++++++- .../driver/xrt-lite/shim/linux/kmq/bo.h | 37 +- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 24 +- .../driver/xrt-lite/shim/linux/kmq/device.h | 9 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 21 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 13 +- .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 6 +- 22 files changed, 1275 insertions(+), 165 deletions(-) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h
rename runtime/src/iree-amd-aie/driver/xrt-lite/cts/{xrt_lite_command_buffer_dispatch_test.cc => command_buffer_dispatch_test.cc} (79%) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/executable.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index 13024aa11..a1d155269 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -263,8 +263,10 @@ std::vector makeChessEnv(Path &vitisDir, Path path(::getenv("PATH")); Path lnx64o = aieToolsPath / "lib" / "lnx64.o"; Path dotLib = aieToolsPath / "lnx64" / "tools" / "dot" / "lib"; - Path ldLibraryPath(::getenv("LD_LIBRARY_PATH")); - + Path ldLibraryPath; + if (char *ldLibraryPath_ = ::getenv("LD_LIBRARY_PATH")) { + ldLibraryPath = ldLibraryPath_; + } std::string pathEnv = "PATH=" + chessccPath.string() + std::string{sys::EnvPathSeparator} + path.string(); std::string ldLibEnv = "LD_LIBRARY_PATH=" + lnx64o.string() + diff --git a/iree_runtime_plugin.cmake b/iree_runtime_plugin.cmake index 3b168ebc3..594ca4ca0 100644 --- a/iree_runtime_plugin.cmake +++ b/iree_runtime_plugin.cmake @@ -21,16 +21,16 @@ if("xrt" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) set(IREE_AMD_AIE_ENABLE_XRT_DRIVER ON) endif() -if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) - include(iree_aie_xrt) - include(iree_aie_bootgen) -endif() - set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER OFF) if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) message(STATUS "Enabling XRT-LITE build because it is an enabled HAL driver") set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER ON) 
endif() +if(IREE_AMD_AIE_ENABLE_XRT_DRIVER OR IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) + include(iree_aie_xrt) + include(iree_aie_bootgen) +endif() + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/runtime/src AMD-AIE) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/experimental AMD-AIE-experimental) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt index 6916d3cc9..9fcdb521f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -20,13 +20,19 @@ iree_cc_library( NAME xrt-lite SRCS - api.h allocator.cc allocator.h + api.h buffer.cc buffer.h + command_buffer.cc + command_buffer.h device.cc driver.cc + executable.cc + executable.h + nop_executable_cache.cc + nop_executable_cache.h util.h DEPS iree::base diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 7c41c5f84..25e45939c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -158,12 +158,9 @@ struct iree_hal_xrt_lite_allocator { // types that do such tracking for them. 
(void)this; - iree_hal_buffer_t* buffer = nullptr; - shim_xcl_bo_flags f = {}; - f.flags = XCL_BO_FLAGS_HOST_ONLY; - f.extension = 0; std::unique_ptr bo = - shim_device->alloc_bo(allocation_size, f); + shim_device->alloc_bo(allocation_size, XCL_BO_FLAGS_HOST_ONLY); + iree_hal_buffer_t* buffer = nullptr; iree_status_t status = iree_hal_xrt_lite_buffer_wrap( std::move(bo), reinterpret_cast(this), compat_params.type, compat_params.access, compat_params.usage, @@ -334,14 +331,14 @@ static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator( #define ALLOCATOR_MEMBER_VOID(member) \ MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, member) -ALLOCATOR_MEMBER_STATUS(trim) -ALLOCATOR_MEMBER_VOID(query_statistics) -ALLOCATOR_MEMBER_STATUS(query_memory_heaps) -ALLOCATOR_MEMBER(query_buffer_compatibility, iree_hal_buffer_compatibility_t) -ALLOCATOR_MEMBER_STATUS(allocate_buffer) -ALLOCATOR_MEMBER_VOID(deallocate_buffer) -ALLOCATOR_MEMBER_STATUS(import_buffer) -ALLOCATOR_MEMBER_STATUS(export_buffer) +ALLOCATOR_MEMBER_STATUS(trim); +ALLOCATOR_MEMBER_VOID(query_statistics); +ALLOCATOR_MEMBER_STATUS(query_memory_heaps); +ALLOCATOR_MEMBER(query_buffer_compatibility, iree_hal_buffer_compatibility_t); +ALLOCATOR_MEMBER_STATUS(allocate_buffer); +ALLOCATOR_MEMBER_VOID(deallocate_buffer); +ALLOCATOR_MEMBER_STATUS(import_buffer); +ALLOCATOR_MEMBER_STATUS(export_buffer); namespace { const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable = { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index a41e12abf..0b1c62523 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -38,7 +38,7 @@ struct iree_hal_xrt_lite_buffer { // can be used by implementations that have no way of providing host // pointers at a large cost (alloc + device->host transfer on map and // host->device transfer + dealloc on umap). 
Try not to use that. - void* host_ptr = this->bo->map(shim_xdna::map_type::write); + void* host_ptr = this->bo->map(); IREE_ASSERT(host_ptr != nullptr); // Should be guaranteed by previous checks. uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; @@ -154,10 +154,10 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { #define BUFFER_MEMBER_STATUS(member) \ MEMBER_WRAPPER_STATUS(iree_hal_buffer_t, iree_hal_xrt_lite_buffer, member) -BUFFER_MEMBER_STATUS(map_range) -BUFFER_MEMBER_STATUS(unmap_range) -BUFFER_MEMBER_STATUS(invalidate_range) -BUFFER_MEMBER_STATUS(flush_range) +BUFFER_MEMBER_STATUS(map_range); +BUFFER_MEMBER_STATUS(unmap_range); +BUFFER_MEMBER_STATUS(invalidate_range); +BUFFER_MEMBER_STATUS(flush_range); namespace { const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable = {
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc
new file mode 100644
index 000000000..59e01fdba
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc
@@ -0,0 +1,300 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/driver/xrt-lite/command_buffer.h"
+
+#include "iree-amd-aie/driver/xrt-lite/util.h"
+
+namespace {
+extern const iree_hal_command_buffer_vtable_t
+    iree_hal_xrt_lite_command_buffer_vtable;
+}
+
+// XRT-LITE command buffer. The recording hooks below are placeholders: the
+// status-returning ones report IREE_STATUS_UNIMPLEMENTED and the debug-group
+// ones do nothing. Each TODO(null) notes what a real implementation must do.
+struct iree_hal_xrt_lite_command_buffer {
+  iree_hal_command_buffer_t base;
+  iree_allocator_t host_allocator;
+
+  // TODO(null): route the start of recording to the implementation if needed.
+  // Creation may happen long before recording, so defer expensive work until
+  // this point to keep profiles easy to read.
+  iree_status_t begin() {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "command buffer recording start not implemented");
+  }
+
+  // TODO(null): perform any multi-pass fixup/linking here. Recording-only
+  // resources are no longer needed after this point and can be disposed.
+  iree_status_t end() {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "command buffer finalization not implemented");
+  }
+
+  // TODO(null): begin a nested debug group (push) if the implementation can
+  // insert markers. Purely informational; may be ignored.
+  void begin_debug_group(iree_string_view_t label,
+                         iree_hal_label_color_t label_color,
+                         const iree_hal_label_location_t* location) {}
+
+  // TODO(null): end a nested debug group (pop). Always called 1:1 in stack
+  // order with begin_debug_group.
+  void end_debug_group() {}
+
+  // TODO(null): a barrier splits the execution sequence into the operations
+  // that happened before it and those that will happen after. With no
+  // concurrency this can be a no-op; it is effectively a signal_event followed
+  // by a wait_event.
+  iree_status_t execution_barrier(
+      iree_hal_execution_stage_t source_stage_mask,
+      iree_hal_execution_stage_t target_stage_mask,
+      iree_hal_execution_barrier_flags_t flags,
+      iree_host_size_t memory_barrier_count,
+      const iree_hal_memory_barrier_t* memory_barriers,
+      iree_host_size_t buffer_barrier_count,
+      const iree_hal_buffer_barrier_t* buffer_barriers) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "execution barriers not implemented");
+  }
+
+  // TODO(null): WIP API and may change; signals the given event so that
+  // waiters may proceed.
+  iree_status_t signal_event(iree_hal_event_t* event,
+                             iree_hal_execution_stage_t source_stage_mask) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "events not implemented");
+  }
+
+  // TODO(null): WIP API and may change; resets the given event to unsignaled.
+  iree_status_t reset_event(iree_hal_event_t* event,
+                            iree_hal_execution_stage_t source_stage_mask) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "events not implemented");
+  }
+
+  // TODO(null): WIP API and may change; waits on the list of events and enacts
+  // the given barriers. Without fine-grained tracking this can be treated as
+  // an execution_barrier that ignores the memory/buffer barriers.
+  iree_status_t wait_events(iree_host_size_t event_count,
+                            const iree_hal_event_t** events,
+                            iree_hal_execution_stage_t source_stage_mask,
+                            iree_hal_execution_stage_t target_stage_mask,
+                            iree_host_size_t memory_barrier_count,
+                            const iree_hal_memory_barrier_t* memory_barriers,
+                            iree_host_size_t buffer_barrier_count,
+                            const iree_hal_buffer_barrier_t* buffer_barriers) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "events not implemented");
+  }
+
+  // TODO(null): WIP API and may change; likely to become an madvise-like
+  // command for controlling prefetching and other cache behavior. Today it is
+  // a hint that the contents will never be used again, so cached data need not
+  // be written back to global memory.
+  iree_status_t discard_buffer(iree_hal_buffer_ref_t buffer_ref) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "discard buffer not implemented");
+  }
+
+  // TODO(null): memset on the buffer; pattern_length is 1, 2, or 4 bytes. The
+  // buffer may reference a binding table slot, in which case it is provided
+  // at queue submission.
+  iree_status_t fill_buffer(iree_hal_buffer_ref_t target_ref,
+                            const void* pattern,
+                            iree_host_size_t pattern_length) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "fill buffer not implemented");
+  }
+
+  // TODO(null): embed and copy a small (~64KB) chunk of host memory into the
+  // target buffer. The source contents must be captured because they may
+  // change or be freed once this call returns. The target may reference a
+  // binding table slot, in which case it is provided at queue submission.
+  iree_status_t update_buffer(const void* source_buffer,
+                              iree_host_size_t source_offset,
+                              iree_hal_buffer_ref_t target_ref) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "update buffer not implemented");
+  }
+
+  // TODO(null): memcpy between two device-visible buffers that may reside on
+  // either the host or the device. Either buffer may reference a binding
+  // table slot, in which case it is provided at queue submission.
+  iree_status_t copy_buffer(iree_hal_buffer_ref_t source_ref,
+                            iree_hal_buffer_ref_t target_ref) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "copy buffer not implemented");
+  }
+
+  // TODO(null): perform the collective operation defined by op (see the
+  // headers). The channel is fixed for a recording, but either buffer may
+  // reference a binding table slot provided at queue submission.
+  iree_status_t collective(iree_hal_channel_t* channel,
+                           iree_hal_collective_op_t op, uint32_t param,
+                           iree_hal_buffer_ref_t send_ref,
+                           iree_hal_buffer_ref_t recv_ref,
+                           iree_device_size_t element_count) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "collectives not implemented");
+  }
+
+  // TODO(null): dispatch the executable entry point with the given workgroup
+  // count. The constants must be copied into the command buffer because they
+  // may be mutated or freed after this call returns. Bindings may reference
+  // binding table slots provided at queue submission.
+  iree_status_t dispatch(iree_hal_executable_t* executable, int32_t entry_point,
+                         const uint32_t workgroup_count[3],
+                         iree_const_byte_span_t constants,
+                         iree_hal_buffer_ref_list_t bindings,
+                         iree_hal_dispatch_flags_t flags) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "dispatch not implemented");
+  }
+
+  // TODO(null): dispatch the executable entry point with a workgroup count
+  // read from the given buffer as a uint32_t[3]; the count may change up until
+  // immediately before the dispatch. The constants must be copied into the
+  // command buffer because they may be mutated or freed after this call
+  // returns. Bindings may reference binding table slots provided at queue
+  // submission.
+  iree_status_t dispatch_indirect(iree_hal_executable_t* executable,
+                                  int32_t entry_point,
+                                  iree_hal_buffer_ref_t workgroups_ref,
+                                  iree_const_byte_span_t constants,
+                                  iree_hal_buffer_ref_list_t bindings,
+                                  iree_hal_dispatch_flags_t flags) {
+    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+                            "indirect dispatch not implemented");
+  }
+};
+
+// Downcasts a base HAL command buffer to the XRT-LITE implementation after
+// asserting that it carries the XRT-LITE vtable.
+static iree_hal_xrt_lite_command_buffer* iree_hal_xrt_lite_command_buffer_cast(
+    iree_hal_command_buffer_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_command_buffer_vtable);
+  return reinterpret_cast<iree_hal_xrt_lite_command_buffer*>(base_value);
+}
+
+// Allocates and initializes an XRT-LITE command buffer. Recording is not
+// implemented yet, so once the allocation succeeds this currently releases the
+// object again and returns IREE_STATUS_UNIMPLEMENTED with a null output.
+iree_status_t iree_hal_xrt_lite_command_buffer_create(
+    iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
+    iree_allocator_t host_allocator,
+    iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = nullptr;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // The validation state shares the allocation and sits right after the struct.
+  const iree_host_size_t validation_state_size =
+      iree_hal_command_buffer_validation_state_size(mode, binding_capacity);
+  iree_hal_xrt_lite_command_buffer* self = nullptr;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator,
+                                sizeof(*self) + validation_state_size,
+                                reinterpret_cast<void**>(&self)));
+  uint8_t* validation_state = reinterpret_cast<uint8_t*>(self) + sizeof(*self);
+  iree_hal_command_buffer_initialize(
+      device_allocator, mode, command_categories, queue_affinity,
+      binding_capacity, validation_state,
+      &iree_hal_xrt_lite_command_buffer_vtable, &self->base);
+  self->host_allocator = host_allocator;
+
+  // TODO(null): allocate any additional resources for managing command buffer
+  // state. Some implementations have their own command buffer/command list
+  // APIs this can route to; others must implement everything themselves using
+  // iree_arena_t/block pools. Implementations should also retain any resources
+  // used during recording; iree_hal_resource_set_t* makes that easier.
+  iree_status_t status = iree_make_status(
+      IREE_STATUS_UNIMPLEMENTED, "command buffers not yet implemented");
+
+  if (!iree_status_is_ok(status)) {
+    iree_hal_command_buffer_release(&self->base);
+  } else {
+    *out_command_buffer = &self->base;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the command buffer; installed as the vtable destroy hook.
+static void iree_hal_xrt_lite_command_buffer_destroy(
+    iree_hal_command_buffer_t* base_command_buffer) {
+  iree_hal_xrt_lite_command_buffer* self =
+      iree_hal_xrt_lite_command_buffer_cast(base_command_buffer);
+  // Copy the allocator out first: it is stored in the memory being freed.
+  const iree_allocator_t allocator = self->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO(null): release any implementation resources and
+  // iree_hal_resource_set_t.
+  iree_allocator_free(allocator, self);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Reports whether |command_buffer| uses the XRT-LITE command buffer vtable.
+bool iree_hal_xrt_lite_command_buffer_isa(
+    iree_hal_command_buffer_t* command_buffer) {
+  return iree_hal_resource_is(&command_buffer->resource,
+                              &iree_hal_xrt_lite_command_buffer_vtable);
+}
+
+// NOTE(review): the MEMBER_WRAPPER* macros (presumably from util.h) appear to
+// generate the iree_hal_xrt_lite_command_buffer_<member> free functions that
+// the vtable below refers to -- confirm against util.h.
+#define COMMAND_BUFFER_MEMBER(member, return_t)                               \
+  MEMBER_WRAPPER(iree_hal_command_buffer_t, iree_hal_xrt_lite_command_buffer, \
+                 member, return_t)
+#define COMMAND_BUFFER_MEMBER_STATUS(member)       \
+  MEMBER_WRAPPER_STATUS(iree_hal_command_buffer_t, \
+                        iree_hal_xrt_lite_command_buffer, member)
+#define COMMAND_BUFFER_MEMBER_VOID(member)       \
+  MEMBER_WRAPPER_VOID(iree_hal_command_buffer_t, \
+                      iree_hal_xrt_lite_command_buffer, member)
+
+COMMAND_BUFFER_MEMBER_STATUS(begin);
+COMMAND_BUFFER_MEMBER_STATUS(end);
+COMMAND_BUFFER_MEMBER_VOID(begin_debug_group);
+COMMAND_BUFFER_MEMBER_VOID(end_debug_group);
+COMMAND_BUFFER_MEMBER_STATUS(execution_barrier);
+COMMAND_BUFFER_MEMBER_STATUS(signal_event);
+COMMAND_BUFFER_MEMBER_STATUS(reset_event);
+COMMAND_BUFFER_MEMBER_STATUS(wait_events);
+COMMAND_BUFFER_MEMBER_STATUS(discard_buffer);
+COMMAND_BUFFER_MEMBER_STATUS(fill_buffer);
+COMMAND_BUFFER_MEMBER_STATUS(update_buffer);
+COMMAND_BUFFER_MEMBER_STATUS(copy_buffer);
+COMMAND_BUFFER_MEMBER_STATUS(collective);
+COMMAND_BUFFER_MEMBER_STATUS(dispatch);
+COMMAND_BUFFER_MEMBER_STATUS(dispatch_indirect);
+
+namespace {
+const iree_hal_command_buffer_vtable_t iree_hal_xrt_lite_command_buffer_vtable =
+    {
+        .destroy = iree_hal_xrt_lite_command_buffer_destroy,
+        .begin = iree_hal_xrt_lite_command_buffer_begin,
+        .end = iree_hal_xrt_lite_command_buffer_end,
+        .begin_debug_group = iree_hal_xrt_lite_command_buffer_begin_debug_group,
+        .end_debug_group = iree_hal_xrt_lite_command_buffer_end_debug_group,
+        .execution_barrier = iree_hal_xrt_lite_command_buffer_execution_barrier,
+        .signal_event = iree_hal_xrt_lite_command_buffer_signal_event,
+        .reset_event = iree_hal_xrt_lite_command_buffer_reset_event,
+        .wait_events = iree_hal_xrt_lite_command_buffer_wait_events,
+        .discard_buffer = iree_hal_xrt_lite_command_buffer_discard_buffer,
+        .fill_buffer = iree_hal_xrt_lite_command_buffer_fill_buffer,
+        .update_buffer = iree_hal_xrt_lite_command_buffer_update_buffer,
+        .copy_buffer = iree_hal_xrt_lite_command_buffer_copy_buffer,
+        .collective = iree_hal_xrt_lite_command_buffer_collective,
+        .dispatch = iree_hal_xrt_lite_command_buffer_dispatch,
+        .dispatch_indirect = iree_hal_xrt_lite_command_buffer_dispatch_indirect,
+};
+}
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h new file mode 100644 index 000000000..7283582bf --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h @@ -0,0 +1,25 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_XRT_LITE_COMMAND_BUFFER_H_ +#define IREE_HAL_DRIVERS_XRT_LITE_COMMAND_BUFFER_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +// Creates an XRT-LITE command buffer.
+iree_status_t iree_hal_xrt_lite_command_buffer_create( + iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, + iree_allocator_t host_allocator, + iree_hal_command_buffer_t** out_command_buffer); + +// Returns true if |command_buffer| is an XRT-LITE command buffer. +bool iree_hal_xrt_lite_command_buffer_isa( + iree_hal_command_buffer_t* command_buffer); + +#endif // IREE_HAL_DRIVERS_XRT_LITE_COMMAND_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt index 4bd029265..8ed1891b0 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -22,72 +22,88 @@ iree_hal_cts_test_suite( INCLUDED_TESTS "allocator" "buffer_mapping" -# "command_buffer" "driver" ) -#set(PEANO_INSTALL_DIR "" CACHE PATH "") -#set(VITIS_DIR "" CACHE PATH "") -#if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) -# message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") -#endif() -#cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") -#set(TARGET_DEVICE "npu1_4col" CACHE STRING "") -# -#iree_bytecode_module( -# NAME -# xrt_lite_executable_cache_test_module -# MODULE_FILE_NAME -# xrt_lite_executable_cache_test.bin -# SRC -# "${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" -# FLAGS -# --compile-mode=hal-executable -# --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} -# --iree-hal-target-backends=amd-aie -# --iree-amdaie-lower-to-aie-pipeline=air -# --iree-amdaie-target-device=${TARGET_DEVICE} -# --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} -# --iree-amd-aie-vitis-install-dir=${VITIS_DIR} -# --iree-amd-aie-enable-chess=$ -# --iree-amd-aie-show-invoked-commands -# PUBLIC -# TESTONLY -#) -# -#iree_c_embed_data( -# NAME -# 
xrt_lite_executables_c -# SRCS -# xrt_lite_executable_cache_test.bin -# C_FILE_OUTPUT -# xrt_lite_executables_c.c -# H_FILE_OUTPUT -# xrt_lite_executables_c.h -# IDENTIFIER -# iree_cts_testdata_executables_aie_xrt_lite -# STRIP_PREFIX -# xrt_lite_ -# DEPENDS -# ::xrt_lite_executable_cache_test_module -# FLATTEN -# PUBLIC -# TESTONLY -#) -# -#iree_cc_test( -# NAME -# xrt_lite_command_buffer_dispatch_test -# SRCS -# xrt_lite_command_buffer_dispatch_test.cc -# DEPS -# ::xrt_lite_executables_c -# iree-amd-aie::driver::xrt-lite::registration -# iree::base -# iree::hal -# iree::hal::cts::cts_test_base -# iree::testing::gtest_main -# iree::tools::testing::e2e::e2e_test_util -#) -# -#target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_command_buffer_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") \ No newline at end of file +set(PEANO_INSTALL_DIR "" CACHE PATH "") +set(VITIS_DIR "" CACHE PATH "") +if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) + message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") +endif() +cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") +set(TARGET_DEVICE "npu1_4col" CACHE STRING "") + +iree_bytecode_module( + NAME + xrt_lite_executable_cache_test_module + MODULE_FILE_NAME + xrt_lite_executable_cache_test.bin + SRC + "${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" + FLAGS + --compile-mode=hal-executable + --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} + --iree-hal-target-backends=amd-aie + --iree-amdaie-lower-to-aie-pipeline=air + --iree-amdaie-target-device=${TARGET_DEVICE} + --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} + --iree-amd-aie-vitis-install-dir=${VITIS_DIR} + --iree-amd-aie-enable-chess=$ + --iree-amd-aie-show-invoked-commands + --iree-hal-memoization=false + --iree-hal-indirect-command-buffers=false + PUBLIC + TESTONLY +) + +iree_c_embed_data( + NAME + xrt_lite_executables_c + SRCS + xrt_lite_executable_cache_test.bin + C_FILE_OUTPUT + xrt_lite_executables_c.c 
+ H_FILE_OUTPUT + xrt_lite_executables_c.h + IDENTIFIER + iree_cts_testdata_executables_aie_xrt_lite + STRIP_PREFIX + xrt_lite_ + DEPENDS + ::xrt_lite_executable_cache_test_module + FLATTEN + PUBLIC + TESTONLY +) + +iree_cc_test( + NAME + xrt_lite_executable_cache_test + SRCS + executable_cache_test.cc + DEPS + ::xrt_lite_executables_c + iree-amd-aie::driver::xrt-lite::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main +) + +iree_cc_test( + NAME + xrt_lite_command_buffer_dispatch_test + SRCS + command_buffer_dispatch_test.cc + DEPS + ::xrt_lite_executables_c + iree-amd-aie::driver::xrt-lite::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main + iree::tools::testing::e2e::e2e_test_util +) + +target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_executable_cache_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") +target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_command_buffer_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/xrt_lite_command_buffer_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/command_buffer_dispatch_test.cc similarity index 79% rename from runtime/src/iree-amd-aie/driver/xrt-lite/cts/xrt_lite_command_buffer_dispatch_test.cc rename to runtime/src/iree-amd-aie/driver/xrt-lite/cts/command_buffer_dispatch_test.cc index fbe4c0720..00053e145 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/xrt_lite_command_buffer_dispatch_test.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/command_buffer_dispatch_test.cc @@ -31,6 +31,7 @@ iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { const auto& file = toc[0]; return iree_make_const_byte_span(file.data, file.size); } + class CommandBufferDispatchTest : public CTSTestBase<::testing::TestWithParam> { protected: @@ -74,6 +75,48 @@ int32_t 
generate_random_number(iree_hal_element_type_t element_type, min; } +TEST_F(CommandBufferDispatchTest, Create) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + EXPECT_TRUE((iree_hal_command_buffer_allowed_categories(command_buffer) & + IREE_HAL_COMMAND_CATEGORY_DISPATCH) == + IREE_HAL_COMMAND_CATEGORY_DISPATCH); + + iree_hal_command_buffer_release(command_buffer); +} + +TEST_F(CommandBufferDispatchTest, BeginEnd) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); + IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer)); + + iree_hal_command_buffer_release(command_buffer); +} + +TEST_F(CommandBufferDispatchTest, SubmitEmpty) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); + IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer)); + + IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer)); + + iree_hal_command_buffer_release(command_buffer); +} + TEST_P(CommandBufferDispatchTest, DispatchMatmul) { PrepareMatmulExecutable(); @@ -174,7 +217,7 @@ TEST_P(CommandBufferDispatchTest, DispatchMatmul) { CleanupExecutable(); } -INSTANTIATE_TEST_SUITE_P(CommandBufferTest, CommandBufferDispatchTest, +INSTANTIATE_TEST_SUITE_P(CommandBufferDispatchTest, CommandBufferDispatchTest, 
::testing::Values(RecordingType::kDirect), GenerateTestName());
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc
new file mode 100644
index 000000000..0904d33a6
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc
@@ -0,0 +1,92 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/driver/xrt-lite/registration/driver_module.h"
+#include "iree/base/api.h"
+#include "iree/base/string_view.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+#include "xrt_lite_executables_c.h"
+
+namespace iree::hal::cts {
+
+// Name of the HAL driver under test.
+const char* get_test_driver_name() { return "xrt-lite"; }
+
+// Registers the XRT-LITE driver module with |registry|.
+iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) {
+  return iree_hal_xrt_lite_driver_module_register(registry);
+}
+
+// Executable format string handed to the executable cache in these tests.
+const char* get_test_executable_format() { return "amdaie-xclbin-fb"; }
+
+// Returns the bytes of the first embedded test executable. NOTE: |file_name|
+// is not consulted; the first table-of-contents entry is always returned.
+iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) {
+  const iree_file_toc_t* first_file =
+      iree_cts_testdata_executables_aie_xrt_lite_create();
+  return iree_make_const_byte_span(first_file->data, first_file->size);
+}
+
+class ExecutableCacheTest : public CTSTestBase<> {};
+
+// An executable cache can be created and released.
+TEST_F(ExecutableCacheTest, Create) {
+  iree_hal_executable_cache_t* cache = nullptr;
+  iree_status_t loop_status = iree_ok_status();
+  const iree_string_view_t cache_name = iree_make_cstring_view("default");
+  IREE_ASSERT_OK(iree_hal_executable_cache_create(
+      device_, cache_name, iree_loop_inline(&loop_status), &cache));
+
+  iree_hal_executable_cache_release(cache);
+  IREE_ASSERT_OK(loop_status);
+}
+
+// Expects the cache to report an unknown format string as not preparable.
+TEST_F(ExecutableCacheTest, CantPrepareUnknownFormat) {
+  iree_hal_executable_cache_t* cache = nullptr;
+  iree_status_t loop_status = iree_ok_status();
+  const iree_string_view_t cache_name = iree_make_cstring_view("default");
+  IREE_ASSERT_OK(iree_hal_executable_cache_create(
+      device_, cache_name, iree_loop_inline(&loop_status), &cache));
+
+  EXPECT_FALSE(iree_hal_executable_cache_can_prepare_format(
+      cache, /*caching_mode=*/0, iree_make_cstring_view("FOO?")));
+
+  iree_hal_executable_cache_release(cache);
+  IREE_ASSERT_OK(loop_status);
+}
+
+// The embedded test executable can be prepared (aliasing the provided data)
+// and released again.
+TEST_F(ExecutableCacheTest, PrepareExecutable) {
+  iree_hal_executable_cache_t* cache = nullptr;
+  iree_status_t loop_status = iree_ok_status();
+  const iree_string_view_t cache_name = iree_make_cstring_view("default");
+  IREE_ASSERT_OK(iree_hal_executable_cache_create(
+      device_, cache_name, iree_loop_inline(&loop_status), &cache));
+
+  iree_hal_executable_params_t params;
+  iree_hal_executable_params_initialize(&params);
+  params.caching_mode = IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA;
+  params.executable_format =
+      iree_make_cstring_view(get_test_executable_format());
+  params.executable_data = get_test_executable_data(
+      iree_make_cstring_view("executable_cache_test.bin"));
+
+  iree_hal_executable_t* executable = nullptr;
+  IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable(
+      cache, &params, &executable));
+
+  iree_hal_executable_release(executable);
+  iree_hal_executable_cache_release(cache);
+  IREE_ASSERT_OK(loop_status);
+}
+
+}  // namespace iree::hal::cts
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 9b3f77d75..ab3dbca70 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -8,15 +8,38 @@ #include "iree-amd-aie/driver/xrt-lite/allocator.h" #include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree-amd-aie/driver/xrt-lite/command_buffer.h" #include 
"iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" +#include "nop_executable_cache.h" -struct iree_hal_xrt_lite_device_t { +struct iree_hal_xrt_lite_device { iree_hal_resource_t resource; iree_string_view_t identifier; iree_allocator_t host_allocator; // not used iree_hal_allocator_t* device_allocator; std::shared_ptr shim_device; + + iree_status_t create_executable_cache( + iree_string_view_t identifier, iree_loop_t loop, + iree_hal_executable_cache_t** out_executable_cache) { + return iree_hal_xrt_lite_nop_executable_cache_create( + shim_device, identifier, host_allocator, out_executable_cache); + } + + iree_status_t create_command_buffer( + iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, + iree_host_size_t binding_capacity, + iree_hal_command_buffer_t** out_command_buffer) { + // TODO(null): pass any additional resources required to create the command + // buffer. The implementation could pool command buffers here. 
+ return iree_hal_xrt_lite_command_buffer_create( + device_allocator, mode, command_categories, queue_affinity, + binding_capacity, host_allocator, out_command_buffer); + } }; namespace { @@ -41,7 +64,7 @@ iree_status_t iree_hal_xrt_lite_device_create( *out_device = nullptr; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device_t* device = nullptr; + iree_hal_xrt_lite_device* device = nullptr; iree_host_size_t total_size = sizeof(*device) + identifier.size; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, total_size, @@ -69,22 +92,20 @@ iree_status_t iree_hal_xrt_lite_device_create( return status; } -static iree_hal_xrt_lite_device_t* iree_hal_xrt_lite_device_cast( +static iree_hal_xrt_lite_device* iree_hal_xrt_lite_device_cast( iree_hal_device_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_device_vtable); - return reinterpret_cast(base_value); + return reinterpret_cast(base_value); } static iree_string_view_t iree_hal_xrt_lite_device_id( iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device_t* device = - iree_hal_xrt_lite_device_cast(base_device); + iree_hal_xrt_lite_device* device = iree_hal_xrt_lite_device_cast(base_device); return device->identifier; } static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device_t* device = - iree_hal_xrt_lite_device_cast(base_device); + iree_hal_xrt_lite_device* device = iree_hal_xrt_lite_device_cast(base_device); iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device); IREE_TRACE_ZONE_BEGIN(z0); @@ -103,23 +124,32 @@ static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { static iree_allocator_t iree_hal_xrt_lite_device_host_allocator( iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device_t* device = - iree_hal_xrt_lite_device_cast(base_device); + iree_hal_xrt_lite_device* device = iree_hal_xrt_lite_device_cast(base_device); return device->host_allocator; } static 
iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device_t* device = - iree_hal_xrt_lite_device_cast(base_device); + iree_hal_xrt_lite_device* device = iree_hal_xrt_lite_device_cast(base_device); return device->device_allocator; } +#define DEVICE_MEMBER(member, return_t) \ + MEMBER_WRAPPER(iree_hal_device_t, iree_hal_xrt_lite_device, member, return_t) +#define DEVICE_MEMBER_STATUS(member) \ + MEMBER_WRAPPER_STATUS(iree_hal_device_t, iree_hal_xrt_lite_device, member) +#define DEVICE_MEMBER_VOID(member) \ + MEMBER_WRAPPER_VOID(iree_hal_device_t, iree_hal_xrt_lite_device, member) + +DEVICE_MEMBER_STATUS(create_executable_cache); +DEVICE_MEMBER_STATUS(create_command_buffer); + namespace { const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { .destroy = iree_hal_xrt_lite_device_destroy, .id = iree_hal_xrt_lite_device_id, .host_allocator = iree_hal_xrt_lite_device_host_allocator, .device_allocator = iree_hal_xrt_lite_device_device_allocator, -}; + .create_executable_cache = iree_hal_xrt_lite_device_create_executable_cache, + .create_command_buffer = iree_hal_xrt_lite_device_create_command_buffer}; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc new file mode 100644 index 000000000..02180a879 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -0,0 +1,273 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/executable.h" + +#include + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h" +#include "iree-amd-aie/schemas/xrt_executable_def_reader.h" +#include "iree-amd-aie/schemas/xrt_executable_def_verifier.h" +#include "iree/base/api.h" + +#define MAX_EXEC_BO_SIZE (4096)
+
+// Executable loaded from an XRT flatbuffer: one kernel parameter record per
+// entry point.
+// NOTE(review): entry_points is declared with 16 elements while the create
+// function sizes its allocation from the runtime entry point count -- confirm
+// which bound is intended.
+struct iree_hal_xrt_lite_native_executable_t {
+  // Abstract resource used for injecting reference counting and vtable; must be
+  // at offset 0.
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  iree_host_size_t entry_point_count;
+  iree_hal_xrt_lite_kernel_params_t entry_points[16];
+};
+
+namespace {
+extern const iree_hal_executable_vtable_t
+    iree_hal_xrt_lite_native_executable_vtable;
+}  // namespace
+
+// Downcasts a base HAL executable to the XRT-LITE implementation after
+// asserting that it carries the XRT-LITE executable vtable.
+static iree_hal_xrt_lite_native_executable_t*
+iree_hal_xrt_lite_native_executable_cast(iree_hal_executable_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_native_executable_vtable);
+  return (iree_hal_xrt_lite_native_executable_t*)base_value;
+}
+
+// Verifies the structure of the flatbuffer so that we can avoid doing so during
+// runtime.
+//
+// There are still some conditions we must be aware of (such as omitted names on
+// functions with internal linkage), however we shouldn't need to bounds check
+// anything within the flatbuffer after this succeeds.
+//
+// Returns IREE_STATUS_INVALID_ARGUMENT when the data is missing or too short,
+// fails flatcc verification, has no entry points or unnamed entry points, has
+// no xclbin, has an asm instruction count that differs from the entry point
+// count, or has per-entry-point xclbin/asm-instruction indices that are
+// missing or out of range.
+static iree_status_t
+iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify(
+    iree_const_byte_span_t flatbuffer_data) {
+  if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "flatbuffer data is not present or less than 16 bytes (%zu total)",
+        flatbuffer_data.data_length);
+  }
+
+  // Run flatcc generated verification. This ensures all pointers are in-bounds
+  // and that we can safely walk the file, but not that the actual contents of
+  // the flatbuffer meet our expectations.
+  int verify_ret = iree_amd_aie_hal_xrt_ExecutableDef_verify_as_root(
+      flatbuffer_data.data, flatbuffer_data.data_length);
+  if (verify_ret != flatcc_verify_ok) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "flatbuffer verification failed: %s",
+                            flatcc_verify_error_string(verify_ret));
+  }
+
+  iree_amd_aie_hal_xrt_ExecutableDef_table_t executable_def =
+      iree_amd_aie_hal_xrt_ExecutableDef_as_root(flatbuffer_data.data);
+
+  flatbuffers_string_vec_t entry_points_vec =
+      iree_amd_aie_hal_xrt_ExecutableDef_entry_points_get(executable_def);
+  size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec);
+  if (entry_point_count == 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "no entry points found in the executable");
+  }
+  for (size_t i = 0; i < entry_point_count; ++i) {
+    if (!flatbuffers_string_len(
+            flatbuffers_string_vec_at(entry_points_vec, i))) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "executable entry point %zu has no name", i);
+    }
+  }
+
+  iree_amd_aie_hal_xrt_XclbinDef_vec_t xclbins =
+      iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def);
+  size_t number_xclbin = iree_amd_aie_hal_xrt_XclbinDef_vec_len(xclbins);
+  if (number_xclbin == 0) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no xclbin present");
+  }
+
+  iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instr =
+      iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def);
+  size_t number_asm_instr = iree_amd_aie_hal_xrt_AsmInstDef_vec_len(asm_instr);
+  if (number_asm_instr != entry_point_count) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "number of entry points (%zu) and number of asm "
+                            "instructions (%zu) mismatched",
+                            entry_point_count, number_asm_instr);
+  }
+
+  // Executable creation reads xclbin_indices[i] and asm_instr_indices[i] for
+  // every entry point i and uses those values to index xclbins/asm_instrs
+  // without further checks, so both the vector lengths and the stored indices
+  // have to be validated here.
+  flatbuffers_uint32_vec_t xclbin_indices =
+      iree_amd_aie_hal_xrt_ExecutableDef_xclbin_indices_get(executable_def);
+  flatbuffers_uint32_vec_t asm_instr_indices =
+      iree_amd_aie_hal_xrt_ExecutableDef_asm_instr_indices_get(executable_def);
+  size_t xclbin_index_count = flatbuffers_uint32_vec_len(xclbin_indices);
+  size_t asm_instr_index_count = flatbuffers_uint32_vec_len(asm_instr_indices);
+  if (xclbin_index_count < entry_point_count ||
+      asm_instr_index_count < entry_point_count) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "executable has %zu entry points but only %zu xclbin indices and %zu "
+        "asm instruction indices",
+        entry_point_count, xclbin_index_count, asm_instr_index_count);
+  }
+  for (size_t i = 0; i < entry_point_count; ++i) {
+    uint32_t xclbin_index = flatbuffers_uint32_vec_at(xclbin_indices, i);
+    if (xclbin_index >= number_xclbin) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "entry point %zu references xclbin %u but only %zu are present", i,
+          (unsigned)xclbin_index, number_xclbin);
+    }
+    uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices, i);
+    if (asm_instr_index >= number_asm_instr) {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "entry point %zu references asm instructions %u but only %zu are "
+          "present",
+          i, (unsigned)asm_instr_index, number_asm_instr);
+    }
+  }
+
+  return iree_ok_status();
+}
+
+iree_status_t
iree_hal_xrt_lite_native_executable_create( + std::shared_ptr shim_device, + const iree_hal_executable_params_t* executable_params, + iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) { + IREE_ASSERT_ARGUMENT(executable_params); + IREE_ASSERT_ARGUMENT(out_executable); + IREE_TRACE_ZONE_BEGIN(z0); + + *out_executable = nullptr; + iree_hal_xrt_lite_native_executable_t* executable = nullptr; + + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( + executable_params->executable_data)); + + iree_amd_aie_hal_xrt_ExecutableDef_table_t executable_def = + iree_amd_aie_hal_xrt_ExecutableDef_as_root( + executable_params->executable_data.data); + flatbuffers_uint32_vec_t xclbin_indices_vec = + iree_amd_aie_hal_xrt_ExecutableDef_xclbin_indices_get(executable_def); + flatbuffers_uint32_vec_t asm_instr_indices_vec = + iree_amd_aie_hal_xrt_ExecutableDef_asm_instr_indices_get(executable_def); + flatbuffers_string_vec_t entry_points_vec = + iree_amd_aie_hal_xrt_ExecutableDef_entry_points_get(executable_def); + iree_amd_aie_hal_xrt_XclbinDef_vec_t xclbins_vec = + iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def); + iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instrs_vec = + iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def); + iree_host_size_t entry_point_count = + flatbuffers_string_vec_len(entry_points_vec); + + // Calculate the total number of characters across all entry point names. This + // is only required when tracing so that we can store copies of the names as + // the flatbuffer storing the strings may be released while the executable is + // still live. 
+ iree_host_size_t total_entry_point_name_chars = 0; + IREE_TRACE({ + for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; + entry_ordinal++) { + const char* entry_name = + flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); + total_entry_point_name_chars += flatbuffers_string_len(entry_name); + } + }); + + iree_host_size_t total_size = + sizeof(*executable) + + entry_point_count * sizeof(executable->entry_points[0]) + + total_entry_point_name_chars; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, + iree_allocator_malloc(host_allocator, total_size, (void**)&executable)); + IREE_TRACE( + char* string_table_buffer = + (char*)((char*)executable + sizeof(*executable) + + entry_point_count * sizeof(executable->entry_points[0]))); + + iree_hal_resource_initialize(&iree_hal_xrt_lite_native_executable_vtable, + &executable->resource); + executable->host_allocator = host_allocator; + executable->entry_point_count = entry_point_count; + for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; + entry_ordinal++) { + const char* entry_name = + flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); + uint32_t xclbin_index = + flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal); + iree_amd_aie_hal_xrt_XclbinDef_table_t xclbin_def = + iree_amd_aie_hal_xrt_XclbinDef_vec_at(xclbins_vec, xclbin_index); + flatbuffers_string_t xclbin_fb = + iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def); + + iree_hal_xrt_lite_kernel_params_t* params = + &executable->entry_points[entry_ordinal]; + + // XRT API needs this vector and cant actually read a void*. 
+ std::vector xclbinVector( + xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); + xrt::xclbin xclbin = xrt::xclbin(xclbinVector); + std::unique_ptr hw_ctx = + shim_device->create_hw_context(xclbin); + + uint32_t asm_instr_index = + flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); + iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def = + iree_amd_aie_hal_xrt_AsmInstDef_vec_at(asm_instrs_vec, asm_instr_index); + flatbuffers_uint32_vec_t asm_inst = + iree_amd_aie_hal_xrt_AsmInstDef_asm_inst_get(asminst_def); + uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); + + size_t ctrl_code_size = num_instr * sizeof(uint32_t); + params->bo_ctrl_code = + shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); + params->bo_exec_buf = + shim_device->alloc_bo(MAX_EXEC_BO_SIZE, XCL_BO_FLAGS_EXECBUF); + + uint32_t* instr_buffer = + static_cast(params->bo_ctrl_code->map()); + memcpy(instr_buffer, asm_inst, ctrl_code_size); + params->num_instr = num_instr; + + // Stash the entry point name in the string table for use when tracing. 
+ IREE_TRACE({ + iree_host_size_t entry_name_length = flatbuffers_string_len(entry_name); + memcpy(string_table_buffer, entry_name, entry_name_length); + params->kernel_name = + iree_make_string_view(string_table_buffer, entry_name_length); + string_table_buffer += entry_name_length; + }); + + IREE_TRACE({ + if (iree_amd_aie_hal_xrt_ExecutableDef_source_locations_is_present( + executable_def)) { + iree_amd_aie_hal_xrt_FileLineLocDef_vec_t source_locs_vec = + iree_amd_aie_hal_xrt_ExecutableDef_source_locations_get( + executable_def); + iree_amd_aie_hal_xrt_FileLineLocDef_table_t source_loc = + iree_amd_aie_hal_xrt_FileLineLocDef_vec_at(source_locs_vec, + entry_ordinal); + flatbuffers_string_t filename = + iree_amd_aie_hal_xrt_FileLineLocDef_filename_get(source_loc); + uint32_t line = + iree_amd_aie_hal_xrt_FileLineLocDef_line_get(source_loc); + params->source_filename = + iree_make_string_view(filename, flatbuffers_string_len(filename)); + params->source_line = line; + } + }); + } + + *out_executable = (iree_hal_executable_t*)executable; + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_native_executable_destroy( + iree_hal_executable_t* base_executable) { + iree_hal_xrt_lite_native_executable_t* executable = + iree_hal_xrt_lite_native_executable_cast(base_executable); + iree_allocator_t host_allocator = executable->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(host_allocator, executable); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( + iree_hal_executable_t* base_executable, int32_t entry_point, + iree_hal_xrt_lite_kernel_params_t* out_params) { + iree_hal_xrt_lite_native_executable_t* executable = + iree_hal_xrt_lite_native_executable_cast(base_executable); + if (entry_point >= executable->entry_point_count) { + return iree_make_status(IREE_STATUS_OUT_OF_RANGE, + "entry point ordinal %d out of range; executable " + "only contains %" 
PRIhsz " entry points", + entry_point, executable->entry_point_count); + } + + memcpy(out_params, &executable->entry_points[entry_point], + sizeof(*out_params)); + return iree_ok_status(); +} + +namespace { +const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable = + { + /*.destroy=*/iree_hal_xrt_lite_native_executable_destroy, +}; +} // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h new file mode 100644 index 000000000..7310a103b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -0,0 +1,51 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ + +#include + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h" +#include "iree/base/api.h" +#include "iree/base/tracing.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Object and launch parameters for a compute kernel. +struct iree_hal_xrt_lite_kernel_params_t { + std::unique_ptr context; + std::unique_ptr bo_ctrl_code; + std::unique_ptr bo_exec_buf; + // Number of assembly instructions argument to the kernel + uint32_t num_instr; // number of instructions + IREE_TRACE(iree_string_view_t kernel_name;) + IREE_TRACE(iree_string_view_t source_filename;) + IREE_TRACE(uint32_t source_line;) +}; + +// |out_executable| must be released by the caller (see +// iree_hal_executable_release). 
+iree_status_t iree_hal_xrt_lite_native_executable_create( + std::shared_ptr shim_device, + const iree_hal_executable_params_t* executable_params, + iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); + +// Returns the kernel launch parameters for the given |entry_point|. +iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( + iree_hal_executable_t* executable, int32_t entry_point, + iree_hal_xrt_lite_kernel_params_t* out_params); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc new file mode 100644 index 000000000..2753eebb7 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -0,0 +1,98 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/nop_executable_cache.h" + +#include + +#include "iree-amd-aie/driver/xrt-lite/executable.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +struct iree_hal_xrt_lite_nop_executable_cache_t { + // Abstract resource used for injecting reference counting and vtable; must be + // at offset 0. 
+ iree_hal_resource_t resource; + std::shared_ptr shim_device; + iree_allocator_t host_allocator; +}; + +namespace { +extern const iree_hal_executable_cache_vtable_t + iree_hal_xrt_lite_nop_executable_cache_vtable; +} // namespace + +static iree_hal_xrt_lite_nop_executable_cache_t* +iree_hal_xrt_lite_nop_executable_cache_cast( + iree_hal_executable_cache_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, + &iree_hal_xrt_lite_nop_executable_cache_vtable); + return (iree_hal_xrt_lite_nop_executable_cache_t*)base_value; +} + +iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( + std::shared_ptr shim_device, + iree_string_view_t identifier, iree_allocator_t host_allocator, + iree_hal_executable_cache_t** out_executable_cache) { + IREE_ASSERT_ARGUMENT(out_executable_cache); + *out_executable_cache = nullptr; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_nop_executable_cache_t* executable_cache = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, sizeof(*executable_cache), + (void**)&executable_cache)); + iree_hal_resource_initialize(&iree_hal_xrt_lite_nop_executable_cache_vtable, + &executable_cache->resource); + executable_cache->host_allocator = host_allocator; + executable_cache->shim_device = shim_device; + + *out_executable_cache = (iree_hal_executable_cache_t*)executable_cache; + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_nop_executable_cache_destroy( + iree_hal_executable_cache_t* base_executable_cache) { + iree_hal_xrt_lite_nop_executable_cache_t* executable_cache = + iree_hal_xrt_lite_nop_executable_cache_cast(base_executable_cache); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(executable_cache->host_allocator, executable_cache); + + IREE_TRACE_ZONE_END(z0); +} + +static bool iree_hal_xrt_lite_nop_executable_cache_can_prepare_format( + iree_hal_executable_cache_t* base_executable_cache, + iree_hal_executable_caching_mode_t caching_mode, + 
iree_string_view_t executable_format) { + return iree_string_view_equal(executable_format, + iree_make_cstring_view("XRT")); +} + +static iree_status_t iree_hal_xrt_lite_nop_executable_cache_prepare_executable( + iree_hal_executable_cache_t* base_executable_cache, + const iree_hal_executable_params_t* executable_params, + iree_hal_executable_t** out_executable) { + iree_hal_xrt_lite_nop_executable_cache_t* executable_cache = + iree_hal_xrt_lite_nop_executable_cache_cast(base_executable_cache); + return iree_hal_xrt_lite_native_executable_create( + executable_cache->shim_device, executable_params, + executable_cache->host_allocator, out_executable); +} + +namespace { +const iree_hal_executable_cache_vtable_t + iree_hal_xrt_lite_nop_executable_cache_vtable = { + /*.destroy = */ iree_hal_xrt_lite_nop_executable_cache_destroy, + /*.can_prepare_format = */ + iree_hal_xrt_lite_nop_executable_cache_can_prepare_format, + /*.prepare_executable = */ + iree_hal_xrt_lite_nop_executable_cache_prepare_executable, +}; +} // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h new file mode 100644 index 000000000..8b0ed658e --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h @@ -0,0 +1,33 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a no-op executable cache that does not cache at all. 
+// This is useful to isolate pipeline caching behavior and verify compilation +// behavior. +// +// |out_executable_cache| must be released by the caller (see +// iree_hal_executable_cache_release). +iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( + std::shared_ptr shim_device, + iree_string_view_t identifier, iree_allocator_t host_allocator, + iree_hal_executable_cache_t** out_executable_cache); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index 6adf21546..cc349197c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -8,6 +8,9 @@ #include #include +#include +#include + #include "shim_debug.h" #include "xrt_mem.h" @@ -358,13 +361,9 @@ properties bo::get_properties() const { return {m_flags, m_aligned_size, get_paddr(), get_drm_bo_handle()}; } -void *bo::map(map_type type) const { - if (type != map_type::write) - shim_err( - EINVAL, - "Not support map BO as readonly. 
Type must be bo::map_type::write"); - return m_aligned; -} +size_t bo::size() { return get_properties().size; } + +void *bo::map() const { return m_aligned; } void bo::unmap(void *addr) {} @@ -431,8 +430,7 @@ void bo::sync(direction dir, size_t size, size_t offset) { } } -void bo::bind_at(size_t pos, const bo *bh, size_t offset, size_t size) { - auto boh = reinterpret_cast(bh); +void bo::bind_at(size_t pos, const bo &boh, size_t offset, size_t size) { std::lock_guard lg(m_args_map_lock); if (m_type != AMDXDNA_BO_CMD) @@ -440,8 +438,8 @@ void bo::bind_at(size_t pos, const bo *bh, size_t offset, size_t size) { if (!pos) m_args_map.clear(); - if (boh->get_type() != AMDXDNA_BO_CMD) { - auto h = boh->get_drm_bo_handle(); + if (boh.get_type() != AMDXDNA_BO_CMD) { + auto h = boh.get_drm_bo_handle(); m_args_map[pos] = h; shim_debug("Added arg BO %d to cmd BO %d", h, get_drm_bo_handle()); } else { @@ -449,7 +447,7 @@ void bo::bind_at(size_t pos, const bo *bh, size_t offset, size_t size) { const size_t max_args = 1 << max_args_order; size_t key = pos << max_args_order; uint32_t hs[max_args]; - auto arg_cnt = boh->get_arg_bo_handles(hs, max_args); + auto arg_cnt = boh.get_arg_bo_handles(hs, max_args); std::string bohs; for (int i = 0; i < arg_cnt; i++) { m_args_map[key + i] = hs[i]; @@ -473,4 +471,107 @@ uint32_t bo::get_arg_bo_handles(uint32_t *handles, size_t num) const { return sz; } +exec_buf::exec_buf(bo &bo_execbuf, uint32_t op) + : m_exec_buf_bo(bo_execbuf), + m_cmd_pkt(reinterpret_cast(bo_execbuf.map())), + m_cmd_size(bo_execbuf.size()), + m_op(op), + m_arg_cnt(0), + m_reg_idx(0) { + std::memset(m_cmd_pkt, 0, m_cmd_size); + m_cmd_pkt->state = ERT_CMD_STATE_NEW; + m_cmd_pkt->opcode = m_op; + m_cmd_pkt->type = ERT_CU; + // One word for cu mask + inc_pkt_count(sizeof(int32_t)); +} + +void exec_buf::set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx) { + ert_start_kernel_cmd *cmd_pkt = + reinterpret_cast(bo_execbuf.map()); + cmd_pkt->cu_mask = 0x1 << cu_idx.index; +} + +void 
exec_buf::set_cu_idx(cuidx_t cu_idx) { + m_cmd_pkt->cu_mask = 0x1 << cu_idx.index; +} + +void exec_buf::add_ctrl_bo(bo &bo_ctrl) { + ert_start_kernel_cmd *cmd_packet = + reinterpret_cast(m_exec_buf_bo.map()); + switch (m_op) { + case ERT_START_CU: + break; + case ERT_START_NPU: { + ert_npu_data *npu_data = get_ert_npu_data(cmd_packet); + npu_data->instruction_buffer = bo_ctrl.get_paddr(); + npu_data->instruction_buffer_size = bo_ctrl.size(); + npu_data->instruction_prop_count = 0; + inc_pkt_count(sizeof(*npu_data)); + break; + } + case ERT_START_DPU: { + ert_dpu_data *dpu_data = get_ert_dpu_data(cmd_packet); + dpu_data->instruction_buffer = bo_ctrl.get_paddr(); + dpu_data->instruction_buffer_size = bo_ctrl.size(); + dpu_data->chained = 0; + inc_pkt_count(sizeof(*dpu_data)); + break; + } + default: + throw std::runtime_error("Unknown exec buf op code: " + + std::to_string(m_op)); + } +} + +void exec_buf::add_arg_32(uint32_t val) { + inc_pkt_count(sizeof(val)); + auto args = get_ert_regmap_begin(m_cmd_pkt); + args[m_reg_idx++] = val; + m_arg_cnt++; +} + +void exec_buf::add_arg_64(uint64_t val) { + inc_pkt_count(sizeof(val)); + auto args = get_ert_regmap_begin(m_cmd_pkt); + args[m_reg_idx++] = val; + args[m_reg_idx++] = val >> 32; + m_arg_cnt++; +} + +void exec_buf::add_arg_bo(bo &bo_arg, std::string arg_name) { + // Add to argument list for driver + m_exec_buf_bo.bind_at(m_arg_cnt, bo_arg, 0, bo_arg.size()); + // Add to argument list for control code patching + if (arg_name.empty()) + m_patching_args.emplace_back(std::to_string(m_arg_cnt), bo_arg.get_paddr()); + else + m_patching_args.emplace_back(arg_name, bo_arg.get_paddr()); + // Only increase m_arg_cnt now after it's used by code above. 
+ add_arg_64(bo_arg.get_paddr()); +} + +void exec_buf::dump() { + std::cout << "Dumping exec buf:"; + int *data = static_cast(m_exec_buf_bo.map()); + std::cout << std::hex; + for (int i = 0; i < m_cmd_pkt->count + 1; i++) { + if (i % 4 == 0) std::cout << "\n"; + std::cout << std::setfill('0') << std::setw(8) << data[i] << " "; + } + std::cout << std::setfill(' ') << std::setw(0) << std::dec << std::endl; + + std::cout << "Dumping patching arguement list:\n"; + for (auto &[arg_name, arg_addr] : m_patching_args) + std::cout << "{ " << arg_name << ", 0x" << std::hex << arg_addr << std::dec + << " }\n"; +} + +void exec_buf::inc_pkt_count(uint32_t n) { + m_cmd_pkt->count += n / sizeof(int32_t); + if (m_cmd_size < + sizeof(m_cmd_pkt->header) + m_cmd_pkt->count * sizeof(int32_t)) + throw std::runtime_error("Size of exec buf too small: " + + std::to_string(m_cmd_size)); +} } // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h index f48849845..617e9335a 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -8,21 +8,14 @@ #include "amdxdna_accel.h" #include "device.h" +#include "ert.h" #include "hwctx.h" namespace shim_xdna { -#define XRT_BO_USE_NORMAL 0 -#define XRT_BO_USE_DEBUG 1 - -// map_type - determines how a buffer is mapped -enum class map_type { read, write }; - enum xclBOSyncDirection { XCL_BO_SYNC_BO_TO_DEVICE = 0, XCL_BO_SYNC_BO_FROM_DEVICE, - XCL_BO_SYNC_BO_GMIO_TO_AIE, - XCL_BO_SYNC_BO_AIE_TO_GMIO, }; // direction - direction of sync operation @@ -80,10 +73,12 @@ struct bo { bo(const pdev &p, size_t size, amdxdna_bo_type type); ~bo(); - void *map(map_type) const; + void *map() const; void unmap(void *addr); void sync(direction, size_t size, size_t offset); properties get_properties() const; + size_t size(); + std::unique_ptr share() const; // For cmd BO only void 
set_cmd_id(uint64_t id); @@ -92,7 +87,7 @@ struct bo { uint32_t get_drm_bo_handle() const; amdxdna_bo_type get_type() const; // DRM BO managed by driver. - void bind_at(size_t pos, const bo *bh, size_t offset, size_t size); + void bind_at(size_t pos, const bo &bh, size_t offset, size_t size); std::string describe() const; // Import DRM BO from m_import shared object void import_bo(); @@ -108,6 +103,28 @@ struct bo { uint32_t get_arg_bo_handles(uint32_t *handles, size_t num) const; }; +struct exec_buf { + bo &m_exec_buf_bo; + ert_start_kernel_cmd *m_cmd_pkt; + size_t m_cmd_size; + uint32_t m_op; + uint32_t m_arg_cnt; + uint32_t m_reg_idx; + std::vector > m_patching_args; + + exec_buf(bo &bo_execbuf, uint32_t op); + static void set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx); + void set_cu_idx(cuidx_t cu_idx); + void add_ctrl_bo(bo &bo_ctrl); + void add_arg_32(uint32_t val); + void add_arg_64(uint64_t val); + void add_arg_bo(bo &bo_arg, std::string arg_name = ""); + void dump(); + static size_t get_ctrl_code_size(const std::string &elf_path); + void patch_ctrl_code(bo &bo_ctrl, const std::string &elf_path); + void inc_pkt_count(uint32_t n); +}; + } // namespace shim_xdna #endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index eeaa6eedc..f373e9b77 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -148,15 +148,14 @@ device::~device() { shim_debug("Destroying KMQ device"); } const pdev &device::get_pdev() const { return m_pdev; } -xrt::xclbin device::get_xclbin(const xrt::uuid &xclbin_id) const { - // Allow access to xclbin in process of loading via device::load_xclbin - if (xclbin_id && xclbin_id == m_xclbin.get_uuid()) return m_xclbin; - throw std::runtime_error("TODO(max):multi-xclbin"); +std::unique_ptr device::create_hw_context( + const xrt::xclbin &xclbin, 
const std::map &qos) { + return std::make_unique(*this, xclbin, qos); } -std::unique_ptr device::create_hw_context( - const xrt::uuid &xclbin_uuid, const std::map &qos) { - return std::make_unique(*this, get_xclbin(xclbin_uuid), qos); +std::unique_ptr device::create_hw_context(const xrt::xclbin &xclbin) { + const std::map qos{}; + return std::make_unique(*this, xclbin, qos); } std::unique_ptr device::alloc_bo(uint32_t ctx_id, size_t size, @@ -168,6 +167,12 @@ std::unique_ptr device::alloc_bo(size_t size, shim_xcl_bo_flags flags) { return alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags); } +std::unique_ptr device::alloc_bo(size_t size, uint32_t flags) { + shim_xcl_bo_flags f{}; + f.flags = flags; + return alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, f); +} + std::unique_ptr device::import_bo(pid_t pid, int ehdl) { return import_bo(import_fd(pid, ehdl)); } @@ -180,11 +185,6 @@ std::unique_ptr device::import_fence(pid_t pid, int ehdl) { return std::make_unique(*this, import_fd(pid, ehdl)); } -void device::record_xclbin(const xrt::xclbin &xclbin) { - std::lock_guard lk(m_mutex); - m_xclbin = xclbin; -} - std::unique_ptr device::import_bo(int ehdl) const { return std::make_unique(this->m_pdev, ehdl); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h index bad30ba4f..23ffd3f27 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ -30,25 +30,25 @@ struct pdev { struct device { enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; - xrt::xclbin m_xclbin; mutable std::mutex m_mutex; pdev m_pdev; device(); ~device(); - xrt::xclbin get_xclbin(const xrt::uuid &xclbin_id) const; - std::unique_ptr import_bo(int ehdl) const; const pdev &get_pdev() const; std::unique_ptr alloc_bo(uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags); + std::unique_ptr alloc_bo(size_t size, 
uint32_t flags); std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); std::unique_ptr import_bo(pid_t, int); std::unique_ptr create_hw_context( - const xrt::uuid &xclbin_uuid, const std::map &qos); + const xrt::xclbin &xclbin, const std::map &qos); + std::unique_ptr create_hw_context(const xrt::xclbin &xclbin); + std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, uint32_t size); size_t write_aie_mem(uint16_t col, uint16_t row, uint32_t offset, @@ -59,7 +59,6 @@ struct device { std::unique_ptr create_fence(fence_handle::access_mode); std::unique_ptr import_fence(pid_t, int); - void record_xclbin(const xrt::xclbin &xclbin); }; std::string read_sysfs(const std::string &filename); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index e533a26d8..2deefa14b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -29,8 +29,8 @@ std::vector get_pdi(const xrt_core::xclbin::aie_partition_obj &aie, namespace shim_xdna { -hw_ctx::hw_ctx(device &dev, const qos_t &qos, std::unique_ptr q, - const xrt::xclbin &xclbin) +hw_ctx::hw_ctx(device &dev, const std::map &qos, + std::unique_ptr q, const xrt::xclbin &xclbin) : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) { shim_debug("Creating HW context..."); @@ -82,10 +82,10 @@ hw_ctx::~hw_ctx() { shim_debug("Destroying KMQ HW context (%d)...", m_handle); } -cuidx_type hw_ctx::open_cu_context(const std::string &cu_name) { +cuidx_t hw_ctx::open_cu_context(const std::string &cu_name) { for (uint32_t i = 0; i < m_cu_info.size(); i++) { auto &ci = m_cu_info[i]; - if (ci.m_name == cu_name) return cuidx_type{.index = i}; + if (ci.m_name == cu_name) return cuidx_t{.index = i}; } shim_err(ENOENT, "CU name (%s) not found", cu_name.c_str()); @@ -138,7 +138,7 @@ void hw_ctx::init_log_buf() { 
shim_xcl_bo_flags f; f.flags = XCL_BO_FLAGS_EXECBUF; m_log_bo = alloc_bo(nullptr, log_buf_size, f); - m_log_buf = m_log_bo->map(map_type::write); + m_log_buf = m_log_bo->map(); std::memset(m_log_buf, 0, log_buf_size); } @@ -146,7 +146,8 @@ void hw_ctx::fini_log_buf() const { if (m_log_bo) m_log_bo->unmap(m_log_buf); } -hw_ctx::hw_ctx(device &device, const xrt::xclbin &xclbin, const qos_t &qos) +hw_ctx::hw_ctx(device &device, const xrt::xclbin &xclbin, + const std::map &qos) : hw_ctx(device, qos, std::make_unique(device), xclbin) { create_ctx_on_device(); std::vector cu_conf_param_buf(sizeof(amdxdna_hwctx_param_config_cu) + @@ -163,7 +164,7 @@ hw_ctx::hw_ctx(device &device, const xrt::xclbin &xclbin, const qos_t &qos) m_pdi_bos.push_back(alloc_bo(ci.m_pdi.size(), f)); std::unique_ptr &pdi_bo = m_pdi_bos[i]; - char *pdi_vaddr = reinterpret_cast(pdi_bo->map(map_type::write)); + char *pdi_vaddr = reinterpret_cast(pdi_bo->map()); // see cu_configs[1] in amdxdna_hwctx_param_config_cu assert(i < 1 && "only 1 CU supported"); @@ -194,9 +195,9 @@ std::unique_ptr hw_ctx::alloc_bo(void *userptr, size_t size, return m_device.alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags); } -std::unique_ptr create_hw_context(device &dev, - const xrt::xclbin &xclbin, - const hw_ctx::qos_t &qos) { +std::unique_ptr create_hw_context( + device &dev, const xrt::xclbin &xclbin, + const std::map &qos) { return std::make_unique(dev, xclbin, qos); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h index 0d25824f1..a145beda5 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -21,7 +21,7 @@ struct cu_info { std::vector m_pdi; }; -struct cuidx_type { +struct cuidx_t { union { std::uint32_t index; struct { @@ -36,7 +36,6 @@ struct cuidx_type { }; struct hw_ctx { - using qos_t = std::map; enum class access_mode : 
uint8_t { exclusive = 0, shared = 1 }; device &m_device; uint32_t m_handle = AMDXDNA_INVALID_CTX_HANDLE; @@ -50,9 +49,10 @@ struct hw_ctx { void *m_log_buf; std::vector> m_pdi_bos; - hw_ctx(device &dev, const qos_t &qos, std::unique_ptr q, - const xrt::xclbin &xclbin); - hw_ctx(device &dev, const xrt::xclbin &xclbin, const qos_t &qos); + hw_ctx(device &dev, const std::map &qos, + std::unique_ptr q, const xrt::xclbin &xclbin); + hw_ctx(device &dev, const xrt::xclbin &xclbin, + const std::map &qos); ~hw_ctx(); // TODO @@ -60,7 +60,8 @@ struct hw_ctx { shim_xcl_bo_flags flags); std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); std::unique_ptr import_bo(pid_t, int); - cuidx_type open_cu_context(const std::string &cuname); + + cuidx_t open_cu_context(const std::string &cuname); void create_ctx_on_device(); void init_log_buf(); void fini_log_buf() const; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp index d41b7c47e..4b0cacb95 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -11,8 +11,7 @@ namespace { ert_packet *get_chained_command_pkt(shim_xdna::bo *boh) { - auto cmdpkt = - reinterpret_cast(boh->map(shim_xdna::map_type::write)); + ert_packet *cmdpkt = reinterpret_cast(boh->map()); return cmdpkt->opcode == ERT_CMD_CHAIN ? 
cmdpkt : nullptr; } @@ -97,8 +96,7 @@ void hw_q::issue_command(bo *cmd_bo) { } int poll_command(bo *cmd) { - auto cmdpkt = reinterpret_cast(cmd->map(map_type::write)); - + ert_packet *cmdpkt = reinterpret_cast(cmd->map()); if (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) { return 1; } From 89a4a01217afeb64cd7f9480e30ea0ded4aca055 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 10 Oct 2024 23:17:53 -0400 Subject: [PATCH 09/35] command buffer --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 29 ++-- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 160 +++++++++--------- .../src/iree-amd-aie/driver/xrt-lite/buffer.h | 26 ++- .../driver/xrt-lite/command_buffer.cc | 13 +- .../driver/xrt-lite/executable.cc | 23 +-- .../iree-amd-aie/driver/xrt-lite/executable.h | 3 +- .../driver/xrt-lite/nop_executable_cache.h | 2 +- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 3 +- .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 4 +- 9 files changed, 141 insertions(+), 122 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 25e45939c..2211d2103 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -125,26 +125,9 @@ struct iree_hal_xrt_lite_allocator { this->query_buffer_compatibility(&compat_params, &allocation_size); if (!iree_all_bits_set(compatibility, IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { - // TODO(benvanik): make a helper for this. 
-#if IREE_STATUS_MODE - iree_bitfield_string_temp_t temp0, temp1, temp2; - iree_string_view_t memory_type_str = - iree_hal_memory_type_format(params->type, &temp0); - iree_string_view_t usage_str = - iree_hal_buffer_usage_format(params->usage, &temp1); - iree_string_view_t compatibility_str = - iree_hal_buffer_compatibility_format(compatibility, &temp2); - return iree_make_status( - IREE_STATUS_INVALID_ARGUMENT, - "allocator cannot allocate a buffer with the given parameters; " - "memory_type=%.*s, usage=%.*s, compatibility=%.*s", - (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, - usage_str.data, (int)compatibility_str.size, compatibility_str.data); -#else return iree_make_status( IREE_STATUS_INVALID_ARGUMENT, "allocator cannot allocate a buffer with the given parameters"); -#endif // IREE_STATUS_MODE } // TODO(null): allocate the underlying device memory. @@ -156,10 +139,18 @@ struct iree_hal_xrt_lite_allocator { // just wrapping those device pointers in the HAL buffer type. Other // implementations that require more tracking can provide their own buffer // types that do such tracking for them. 
- (void)this; + + uint32_t flags = XCL_BO_FLAGS_HOST_ONLY; + // if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { + // flags = XCL_BO_FLAGS_CACHEABLE; + // } else if (iree_all_bits_set(params->type, + // IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE)) { + // // TODO(max): the test here isn't specific enough + // flags = XCL_BO_FLAGS_EXECBUF; + // } std::unique_ptr bo = - shim_device->alloc_bo(allocation_size, XCL_BO_FLAGS_HOST_ONLY); + shim_device->alloc_bo(allocation_size, flags); iree_hal_buffer_t* buffer = nullptr; iree_status_t status = iree_hal_xrt_lite_buffer_wrap( std::move(bo), reinterpret_cast(this), diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index 0b1c62523..7f92c9811 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -9,92 +9,87 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" #include "iree-amd-aie/driver/xrt-lite/util.h" -struct iree_hal_xrt_lite_buffer { - iree_hal_buffer_t base; - std::unique_ptr bo; - iree_hal_buffer_release_callback_t release_callback; - - iree_status_t map_range(iree_hal_mapping_mode_t mapping_mode, - iree_hal_memory_access_t memory_access, - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { - IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( - iree_hal_buffer_memory_type( - reinterpret_cast(this)), - IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); - IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( - iree_hal_buffer_allowed_usage( - reinterpret_cast(this)), - mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT - ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT - : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); - - // TODO(null): perform mapping as described. Note that local-to-buffer range - // adjustment may be required. 
The resulting mapping is populated with - // standard information such as contents indicating the host addressable - // memory range of the mapped buffer and implementation-specific information - // if additional resources are required. iree_hal_buffer_emulated_map_range - // can be used by implementations that have no way of providing host - // pointers at a large cost (alloc + device->host transfer on map and - // host->device transfer + dealloc on umap). Try not to use that. - void* host_ptr = this->bo->map(); - IREE_ASSERT(host_ptr != - nullptr); // Should be guaranteed by previous checks. - uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; - iree_status_t status = - this->invalidate_range(local_byte_offset, local_byte_length); - // If we mapped for discard scribble over the bytes. This is not a mandated - // behavior but it will make debugging issues easier. Alternatively for heap - // buffers we could reallocate them such that ASAN yells, but that would - // only work if the entire buffer was discarded. +iree_status_t iree_hal_xrt_lite_buffer::map_range( + iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type( + reinterpret_cast(this)), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( + iree_hal_buffer_allowed_usage( + reinterpret_cast(this)), + mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT + ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT + : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + + // TODO(null): perform mapping as described. Note that local-to-buffer range + // adjustment may be required. 
The resulting mapping is populated with + // standard information such as contents indicating the host addressable + // memory range of the mapped buffer and implementation-specific information + // if additional resources are required. iree_hal_buffer_emulated_map_range + // can be used by implementations that have no way of providing host + // pointers at a large cost (alloc + device->host transfer on map and + // host->device transfer + dealloc on umap). Try not to use that. + void* host_ptr = this->bo->map(); + IREE_ASSERT(host_ptr != nullptr); // Should be guaranteed by previous checks. + uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; + iree_status_t status = + this->invalidate_range(local_byte_offset, local_byte_length); + // If we mapped for discard scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for heap + // buffers we could reallocate them such that ASAN yells, but that would + // only work if the entire buffer was discarded. #ifndef NDEBUG - if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { - memset(data_ptr, 0xCD, local_byte_length); - } -#endif // !NDEBUG - mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); - return status; + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); } +#endif // !NDEBUG + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return status; +} - iree_status_t unmap_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { - // TODO(null): reverse of map_range. Note that cache invalidation is - // explicit via invalidate_range and need not be performed here. If using - // emulated mapping this must call iree_hal_buffer_emulated_unmap_range to - // release the transient resources. 
- return this->flush_range(local_byte_offset, local_byte_length); - } +iree_status_t iree_hal_xrt_lite_buffer::unmap_range( + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + // TODO(null): reverse of map_range. Note that cache invalidation is + // explicit via invalidate_range and need not be performed here. If using + // emulated mapping this must call iree_hal_buffer_emulated_unmap_range to + // release the transient resources. + return this->flush_range(local_byte_offset, local_byte_length); +} - iree_status_t invalidate_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - // TODO(null): invalidate the range if required by the buffer. Writes on the - // device are expected to be visible to the host after this returns. - if (IREE_UNLIKELY(!this->bo)) { - return iree_make_status( - IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have device memory attached and cannot be mapped"); - } - this->bo->sync(shim_xdna::direction::device2host, local_byte_length, - local_byte_offset); - return iree_ok_status(); +iree_status_t iree_hal_xrt_lite_buffer::invalidate_range( + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // TODO(null): invalidate the range if required by the buffer. Writes on the + // device are expected to be visible to the host after this returns. + if (IREE_UNLIKELY(!this->bo)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); } + this->bo->sync(shim_xdna::direction::device2host, local_byte_length, + local_byte_offset); + return iree_ok_status(); +} - iree_status_t flush_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - // TODO(null): flush the range if required by the buffer. Writes on the - // host are expected to be visible to the device after this returns. 
- if (IREE_UNLIKELY(!this->bo)) { - return iree_make_status( - IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have device memory attached and cannot be mapped"); - } - this->bo->sync(shim_xdna::direction::host2device, local_byte_length, - local_byte_offset); - return iree_ok_status(); +iree_status_t iree_hal_xrt_lite_buffer::flush_range( + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // TODO(null): flush the range if required by the buffer. Writes on the + // host are expected to be visible to the device after this returns. + if (IREE_UNLIKELY(!this->bo)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); } -}; + this->bo->sync(shim_xdna::direction::host2device, local_byte_length, + local_byte_offset); + return iree_ok_status(); +} namespace { extern const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable; @@ -151,6 +146,13 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { IREE_TRACE_ZONE_END(z0); } +std::unique_ptr iree_hal_xrt_lite_buffer_unwrap( + iree_hal_buffer_t* base_buffer) { + iree_hal_xrt_lite_buffer* buffer = + reinterpret_cast(base_buffer); + return std::move(buffer->bo); +} + #define BUFFER_MEMBER_STATUS(member) \ MEMBER_WRAPPER_STATUS(iree_hal_buffer_t, iree_hal_xrt_lite_buffer, member) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h index 31849a30d..c89f14164 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h @@ -11,7 +11,28 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -// Wraps an allocation in an iree_hal_buffer_t. 
+struct iree_hal_xrt_lite_buffer { + iree_hal_buffer_t base; + std::unique_ptr bo; + iree_hal_buffer_release_callback_t release_callback; + + iree_status_t map_range(iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping); + + iree_status_t unmap_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping); + + iree_status_t invalidate_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length); + + iree_status_t flush_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length); +}; + iree_status_t iree_hal_xrt_lite_buffer_wrap( std::unique_ptr bo, iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, @@ -20,4 +41,7 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( iree_hal_buffer_release_callback_t release_callback, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); +std::unique_ptr iree_hal_xrt_lite_buffer_unwrap( + iree_hal_buffer_t* base_buffer); + #endif // IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc index 59e01fdba..52e124af2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc @@ -6,7 +6,11 @@ #include "iree-amd-aie/driver/xrt-lite/command_buffer.h" +#include "buffer.h" #include "iree-amd-aie/driver/xrt-lite/util.h" +#include "shim/linux/kmq/bo.h" + +#define MAX_EXEC_BO_SIZE (4096) namespace { extern const iree_hal_command_buffer_vtable_t @@ -16,6 +20,7 @@ extern const iree_hal_command_buffer_vtable_t struct iree_hal_xrt_lite_command_buffer { iree_hal_command_buffer_t base; iree_allocator_t host_allocator; + iree_hal_buffer_t* exec_buffer; 
iree_status_t begin() { // TODO(null): if the implementation needs to route the begin to the @@ -252,8 +257,10 @@ iree_status_t iree_hal_xrt_lite_command_buffer_create( // iree_arena_t/block pools. Implementations should also retain any resources // used during the recording and can use iree_hal_resource_set_t* to make that // easier. - iree_status_t status = iree_make_status( - IREE_STATUS_UNIMPLEMENTED, "command buffers not yet implemented"); + iree_hal_buffer_params_t params; + params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE; + iree_status_t status = iree_hal_allocator_allocate_buffer( + device_allocator, params, MAX_EXEC_BO_SIZE, &command_buffer->exec_buffer); if (iree_status_is_ok(status)) { *out_command_buffer = &command_buffer->base; @@ -273,7 +280,7 @@ static void iree_hal_xrt_lite_command_buffer_destroy( // TODO(null): release any implementation resources and // iree_hal_resource_set_t. - + iree_hal_buffer_destroy(command_buffer->exec_buffer); iree_allocator_free(host_allocator, command_buffer); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index 02180a879..fdb48b0d8 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -14,8 +14,6 @@ #include "iree-amd-aie/schemas/xrt_executable_def_verifier.h" #include "iree/base/api.h" -#define MAX_EXEC_BO_SIZE (4096) - struct iree_hal_xrt_lite_native_executable_t { // Abstract resource used for injecting reference counting and vtable; must be // at offset 0. 
@@ -33,7 +31,7 @@ extern const iree_hal_executable_vtable_t static iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast(iree_hal_executable_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_native_executable_vtable); - return (iree_hal_xrt_lite_native_executable_t*)base_value; + return reinterpret_cast(base_value); } // Verifies the structure of the flatbuffer so that we can avoid doing so during @@ -151,12 +149,11 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( entry_point_count * sizeof(executable->entry_points[0]) + total_entry_point_name_chars; IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - iree_allocator_malloc(host_allocator, total_size, (void**)&executable)); - IREE_TRACE( - char* string_table_buffer = - (char*)((char*)executable + sizeof(*executable) + - entry_point_count * sizeof(executable->entry_points[0]))); + z0, iree_allocator_malloc(host_allocator, total_size, + reinterpret_cast(&executable))); + IREE_TRACE(char* string_table_buffer = reinterpret_cast( + reinterpret_cast(executable) + sizeof(*executable) + + entry_point_count * sizeof(executable->entry_points[0]))); iree_hal_resource_initialize(&iree_hal_xrt_lite_native_executable_vtable, &executable->resource); @@ -180,8 +177,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); xrt::xclbin xclbin = xrt::xclbin(xclbinVector); - std::unique_ptr hw_ctx = - shim_device->create_hw_context(xclbin); + params->context = shim_device->create_hw_context(xclbin); uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); @@ -194,9 +190,6 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( size_t ctrl_code_size = num_instr * sizeof(uint32_t); params->bo_ctrl_code = shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); - params->bo_exec_buf = - shim_device->alloc_bo(MAX_EXEC_BO_SIZE, XCL_BO_FLAGS_EXECBUF); - 
uint32_t* instr_buffer = static_cast(params->bo_ctrl_code->map()); memcpy(instr_buffer, asm_inst, ctrl_code_size); @@ -231,7 +224,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( }); } - *out_executable = (iree_hal_executable_t*)executable; + *out_executable = reinterpret_cast(executable); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index 7310a103b..b70c266cc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -24,7 +24,6 @@ extern "C" { struct iree_hal_xrt_lite_kernel_params_t { std::unique_ptr context; std::unique_ptr bo_ctrl_code; - std::unique_ptr bo_exec_buf; // Number of assembly instructions argument to the kernel uint32_t num_instr; // number of instructions IREE_TRACE(iree_string_view_t kernel_name;) @@ -45,7 +44,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( iree_hal_xrt_lite_kernel_params_t* out_params); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h index 8b0ed658e..251119fdd 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h @@ -27,7 +27,7 @@ iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( iree_hal_executable_cache_t** out_executable_cache); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index f373e9b77..0e521e4f2 100644 --- 
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -128,8 +128,9 @@ pdev::~pdev() { } void pdev::ioctl(unsigned long cmd, void *arg) const { - if (::ioctl(m_dev_fd, cmd, arg) == -1) + if (::ioctl(m_dev_fd, cmd, arg) == -1) { shim_err(errno, "%s IOCTL failed", ioctl_cmd2name(cmd).c_str()); + } } void *pdev::mmap(void *addr, size_t len, int prot, int flags, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp index 4b0cacb95..868b2eb0e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -71,7 +71,9 @@ void hw_q::submit_wait(const std::vector &fences) { void hw_q::submit_signal(const fence_handle *f) { f->submit_signal(m_hwctx); } -hw_q::~hw_q() { shim_debug("Destroying KMQ HW queue"); } +hw_q::~hw_q() { + shim_debug("Destroying KMQ HW queue"); +} void hw_q::issue_command(bo *cmd_bo) { // Assuming 1024 max args per cmd bo From f66ed50ef853c6b2abef9cfc17e6eb3d2fa79503 Mon Sep 17 00:00:00 2001 From: makslevental Date: Fri, 11 Oct 2024 17:30:48 -0400 Subject: [PATCH 10/35] e2e --- build_tools/build_test_cpp.ps1 | 2 +- build_tools/build_test_cpp.sh | 2 +- iree_compiler_plugin.cmake | 8 +- .../driver/xrt-lite/CMakeLists.txt | 8 +- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 11 +- .../src/iree-amd-aie/driver/xrt-lite/buffer.h | 3 +- .../driver/xrt-lite/command_buffer.cc | 341 ---------------- .../driver/xrt-lite/command_buffer.h | 25 -- .../driver/xrt-lite/cts/CMakeLists.txt | 53 ++- .../xrt-lite/cts/executable_cache_test.mlir | 22 +- ...spatch_test.cc => matmul_dispatch_test.cc} | 18 +- .../iree-amd-aie/driver/xrt-lite/device.cc | 117 +++++- .../src/iree-amd-aie/driver/xrt-lite/device.h | 17 - .../driver/xrt-lite/direct_command_buffer.cc | 371 ++++++++++++++++++ 
.../driver/xrt-lite/direct_command_buffer.h | 32 ++ .../driver/xrt-lite/executable.cc | 37 +- .../iree-amd-aie/driver/xrt-lite/executable.h | 8 +- .../driver/xrt-lite/nop_executable_cache.cc | 1 + .../driver/xrt-lite/nop_semaphore.cc | 115 ++++++ .../driver/xrt-lite/nop_semaphore.h | 27 ++ .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 27 +- .../driver/xrt-lite/shim/linux/kmq/bo.h | 13 +- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 5 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 1 + .../iree-amd-aie/driver/xrt/CMakeLists.txt | 2 + .../driver/xrt/cts/CMakeLists.txt | 111 ++++++ .../driver/xrt/cts/executable_cache_test.cc | 85 ++++ .../driver/xrt/cts/executable_cache_test.mlir | 33 ++ .../driver/xrt/cts/matmul_dispatch_test.cc | 224 +++++++++++ 29 files changed, 1254 insertions(+), 465 deletions(-) delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h rename runtime/src/iree-amd-aie/driver/xrt-lite/cts/{command_buffer_dispatch_test.cc => matmul_dispatch_test.cc} (94%) delete mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/device.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir create mode 100644 runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1 index 9d214a5db..4a579cbd6 100644 --- a/build_tools/build_test_cpp.ps1 +++ b/build_tools/build_test_cpp.ps1 @@ -124,7 
+124,7 @@ echo "-----" # better have git-bash installed... $env:Path = "C:\Program Files\Git\bin;$env:Path" pushd $build_dir -& bash -l -c "ctest -R amd-aie -E driver/xrt-lite --output-on-failure -j --repeat until-pass:5" +& bash -l -c "ctest -R amd-aie -E driver --output-on-failure -j --repeat until-pass:5" popd if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index e4a0a661e..612f5999a 100644 --- a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -143,7 +143,7 @@ cmake --build "$build_dir" --target iree-install-dist echo "CTest" echo "-----" if [[ "$OSTYPE" == "linux"* ]]; then - ctest --test-dir "$build_dir" -R amd-aie -E "driver/xrt-lite" --output-on-failure -j + ctest --test-dir "$build_dir" -R amd-aie -E "driver" --output-on-failure -j elif [[ "$OSTYPE" == "darwin"* ]]; then ctest --test-dir "$build_dir" -R amd-aie -E "matmul_pack_peel_air_e2e|matmul_elementwise_pack_peel_air_e2e|conv_fill_spec_pad" --output-on-failure -j --repeat until-pass:5 fi diff --git a/iree_compiler_plugin.cmake b/iree_compiler_plugin.cmake index 958d6de46..3b50361c8 100644 --- a/iree_compiler_plugin.cmake +++ b/iree_compiler_plugin.cmake @@ -17,7 +17,13 @@ if("xrt" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) set(IREE_AMD_AIE_ENABLE_XRT_DRIVER ON) endif() -if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) +set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER OFF) +if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) + message(STATUS "Enabling XRT-LITE build because it is an enabled HAL driver") + set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER ON) +endif() + +if(IREE_AMD_AIE_ENABLE_XRT_DRIVER OR IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) include(iree_aie_xrt) endif() include(iree_aie_bootgen) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt index 9fcdb521f..d5e5ecea1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt +++ 
b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -25,18 +25,22 @@ iree_cc_library( api.h buffer.cc buffer.h - command_buffer.cc - command_buffer.h + direct_command_buffer.cc + direct_command_buffer.h device.cc driver.cc executable.cc executable.h nop_executable_cache.cc nop_executable_cache.h + nop_semaphore.cc + nop_semaphore.h util.h DEPS iree::base iree::base::core_headers + iree::hal::utils::deferred_command_buffer + iree::hal::utils::semaphore_base iree::base::internal::flatcc::parsing iree-amd-aie::schemas::xrt_executable_def_c_fbs iree-amd-aie::driver::xrt-lite::shim::linux::kmq::shim-xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index 7f92c9811..a78ee9aa2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -71,8 +71,7 @@ iree_status_t iree_hal_xrt_lite_buffer::invalidate_range( IREE_STATUS_FAILED_PRECONDITION, "buffer does not have device memory attached and cannot be mapped"); } - this->bo->sync(shim_xdna::direction::device2host, local_byte_length, - local_byte_offset); + this->bo->sync(shim_xdna::direction::device2host); return iree_ok_status(); } @@ -86,8 +85,7 @@ iree_status_t iree_hal_xrt_lite_buffer::flush_range( IREE_STATUS_FAILED_PRECONDITION, "buffer does not have device memory attached and cannot be mapped"); } - this->bo->sync(shim_xdna::direction::host2device, local_byte_length, - local_byte_offset); + this->bo->sync(shim_xdna::direction::host2device); return iree_ok_status(); } @@ -146,11 +144,10 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { IREE_TRACE_ZONE_END(z0); } -std::unique_ptr iree_hal_xrt_lite_buffer_unwrap( - iree_hal_buffer_t* base_buffer) { +shim_xdna::bo* iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer) { iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); - return std::move(buffer->bo); + return 
buffer->bo.get(); } #define BUFFER_MEMBER_STATUS(member) \ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h index c89f14164..c6f34b7b9 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h @@ -41,7 +41,6 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( iree_hal_buffer_release_callback_t release_callback, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); -std::unique_ptr iree_hal_xrt_lite_buffer_unwrap( - iree_hal_buffer_t* base_buffer); +shim_xdna::bo* iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer); #endif // IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc deleted file mode 100644 index 52e124af2..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree-amd-aie/driver/xrt-lite/command_buffer.h" - -#include "buffer.h" -#include "iree-amd-aie/driver/xrt-lite/util.h" -#include "shim/linux/kmq/bo.h" - -#define MAX_EXEC_BO_SIZE (4096) - -namespace { -extern const iree_hal_command_buffer_vtable_t - iree_hal_xrt_lite_command_buffer_vtable; -} - -struct iree_hal_xrt_lite_command_buffer { - iree_hal_command_buffer_t base; - iree_allocator_t host_allocator; - iree_hal_buffer_t* exec_buffer; - - iree_status_t begin() { - // TODO(null): if the implementation needs to route the begin to the - // implementation it can be done here. Note that creation may happen much - // earlier than recording and any expensive work should be deferred until - // this point to make profiling easier. 
- (void)this; - iree_status_t status = - iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "command buffer recording start not implemented"); - return status; - } - - iree_status_t end() { - // TODO(null): if recording requires multiple passes any fixup/linking can - // happen here. Recording-only resources are no longer needed after this - // point and can be disposed. - (void)this; - iree_status_t status = - iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "command buffer finalization not implemented"); - return status; - } - - void begin_debug_group(iree_string_view_t label, - iree_hal_label_color_t label_color, - const iree_hal_label_location_t* location) { - // TODO(null): begin a nested debug group (push) if the implementation has a - // way to insert markers. This is informational and can be ignored. - (void)this; - } - - void end_debug_group() { - // TODO(null): end a nested debug group (pop). Always called 1:1 in stack - // order with begin_debug_group. - (void)this; - } - - iree_status_t execution_barrier( - iree_hal_execution_stage_t source_stage_mask, - iree_hal_execution_stage_t target_stage_mask, - iree_hal_execution_barrier_flags_t flags, - iree_host_size_t memory_barrier_count, - const iree_hal_memory_barrier_t* memory_barriers, - iree_host_size_t buffer_barrier_count, - const iree_hal_buffer_barrier_t* buffer_barriers) { - // TODO(null): barriers split the execution sequence into all operations - // that did happen before the barrier and all that will happen after. In - // implementations that have no concurrency this can be a no-op. This is - // effectively just a signal_event followed by a wait_event. - (void)this; - iree_status_t status = iree_make_status( - IREE_STATUS_UNIMPLEMENTED, "execution barriers not implemented"); - return status; - } - - iree_status_t signal_event(iree_hal_event_t* event, - iree_hal_execution_stage_t source_stage_mask) { - // TODO(null): WIP API and may change; signals the given event allowing - // waiters to proceed. 
- (void)this; - iree_status_t status = - iree_make_status(IREE_STATUS_UNIMPLEMENTED, "events not implemented"); - return status; - } - - iree_status_t reset_event(iree_hal_event_t* event, - iree_hal_execution_stage_t source_stage_mask) { - // TODO(null): WIP API and may change; resets the given event to unsignaled. - (void)this; - iree_status_t status = - iree_make_status(IREE_STATUS_UNIMPLEMENTED, "events not implemented"); - return status; - } - - iree_status_t wait_events(iree_host_size_t event_count, - const iree_hal_event_t** events, - iree_hal_execution_stage_t source_stage_mask, - iree_hal_execution_stage_t target_stage_mask, - iree_host_size_t memory_barrier_count, - const iree_hal_memory_barrier_t* memory_barriers, - iree_host_size_t buffer_barrier_count, - const iree_hal_buffer_barrier_t* buffer_barriers) { - // TODO(null): WIP API and may change; waits on the list of events and - // enacts the specified set of barriers. Implementations without - // fine-grained tracking can treat this as an execution_barrier and ignore - // the memory/buffer barriers provided. - (void)this; - iree_status_t status = - iree_make_status(IREE_STATUS_UNIMPLEMENTED, "events not implemented"); - return status; - } - - iree_status_t discard_buffer(iree_hal_buffer_ref_t buffer_ref) { - // TODO(null): WIP API and may change; this is likely to become an - // madvise-like command that can be used to control prefetching and other - // cache behavior. The current discard behavior is a hint that the buffer - // contents will never be used again and that if they are in a cache they - // need not be written back to global memory. - (void)this; - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "discard buffer not implemented"); - return status; - } - - iree_status_t fill_buffer(iree_hal_buffer_ref_t target_ref, - const void* pattern, - iree_host_size_t pattern_length) { - // TODO(null): memset on the buffer. The pattern_length is 1, 2, or 4 bytes. 
- // Note that the buffer may be a reference to a binding table slot in which - // case it will be provided during submission to a queue. - (void)this; - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "fill buffer not implemented"); - return status; - } - - iree_status_t update_buffer(const void* source_buffer, - iree_host_size_t source_offset, - iree_hal_buffer_ref_t target_ref) { - // TODO(null): embed and copy a small (~64KB) chunk of host memory to the - // target buffer. The source_buffer contents must be captured as they may - // change/be freed after this call completes. - // Note that the target buffer may be a reference to a binding table slot in - // which case it will be provided during submission to a queue. - (void)this; - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "update buffer not implemented"); - - return status; - } - - iree_status_t copy_buffer(iree_hal_buffer_ref_t source_ref, - iree_hal_buffer_ref_t target_ref) { - // TODO(null): memcpy between two buffers. The buffers must both be - // device-visible but may reside on either the host or device. - // Note that either buffer may be a reference to a binding table slot in - // which case it will be provided during submission to a queue. - (void)this; - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "copy buffer not implemented"); - - return status; - } - - iree_status_t collective(iree_hal_channel_t* channel, - iree_hal_collective_op_t op, uint32_t param, - iree_hal_buffer_ref_t send_ref, - iree_hal_buffer_ref_t recv_ref, - iree_device_size_t element_count) { - // TODO(null): perform the collective operation defined by op. See the - // headers for more information. The channel is fixed for a particular - // recording but note that either buffer may be a reference to a binding - // table slot in which case it will be provided during submission to a - // queue. 
- (void)this; - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "collectives not implemented"); - - return status; - } - - iree_status_t dispatch(iree_hal_executable_t* executable, int32_t entry_point, - const uint32_t workgroup_count[3], - iree_const_byte_span_t constants, - iree_hal_buffer_ref_list_t bindings, - iree_hal_dispatch_flags_t flags) { - // TODO(null): dispatch the specified executable entry point with the given - // workgroup count. The constants must be copied into the command buffer as - // they may be mutated or freed after this call returns. - // Note that any of the bindings may be references to binding table slots in - // which case they will be provided during submission to a queue. - (void)this; - iree_status_t status = - iree_make_status(IREE_STATUS_UNIMPLEMENTED, "dispatch not implemented"); - - return status; - } - - iree_status_t dispatch_indirect(iree_hal_executable_t* executable, - int32_t entry_point, - iree_hal_buffer_ref_t workgroups_ref, - iree_const_byte_span_t constants, - iree_hal_buffer_ref_list_t bindings, - iree_hal_dispatch_flags_t flags) { - // TODO(null): dispatch the specified executable entry point with a - // workgroup count that is stored in the given workgroup count buffer as a - // uint32_t[3]. The workgroup count may change up until immediately prior to - // the dispatch. The constants must be copied into the command buffer as - // they may be mutated or freed after this call returns. Note that any of - // the bindings may be references to binding table slots in which case they - // will be provided during submission to a queue. 
- (void)this; - iree_status_t status = iree_make_status( - IREE_STATUS_UNIMPLEMENTED, "indirect dispatch not implemented"); - - return status; - } -}; - -static iree_hal_xrt_lite_command_buffer* iree_hal_xrt_lite_command_buffer_cast( - iree_hal_command_buffer_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_command_buffer_vtable); - return (iree_hal_xrt_lite_command_buffer*)base_value; -} - -iree_status_t iree_hal_xrt_lite_command_buffer_create( - iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, - iree_hal_command_category_t command_categories, - iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, - iree_allocator_t host_allocator, - iree_hal_command_buffer_t** out_command_buffer) { - IREE_ASSERT_ARGUMENT(out_command_buffer); - *out_command_buffer = nullptr; - IREE_TRACE_ZONE_BEGIN(z0); - - iree_hal_xrt_lite_command_buffer* command_buffer = nullptr; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - iree_allocator_malloc(host_allocator, - sizeof(*command_buffer) + - iree_hal_command_buffer_validation_state_size( - mode, binding_capacity), - (void**)&command_buffer)); - iree_hal_command_buffer_initialize( - device_allocator, mode, command_categories, queue_affinity, - binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer), - &iree_hal_xrt_lite_command_buffer_vtable, &command_buffer->base); - command_buffer->host_allocator = host_allocator; - - // TODO(null): allocate any additional resources for managing command buffer - // state. Some implementations may have their own command buffer/command list - // APIs this can route to or may need to implement it all themselves using - // iree_arena_t/block pools. Implementations should also retain any resources - // used during the recording and can use iree_hal_resource_set_t* to make that - // easier. 
- iree_hal_buffer_params_t params; - params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE; - iree_status_t status = iree_hal_allocator_allocate_buffer( - device_allocator, params, MAX_EXEC_BO_SIZE, &command_buffer->exec_buffer); - - if (iree_status_is_ok(status)) { - *out_command_buffer = &command_buffer->base; - } else { - iree_hal_command_buffer_release(&command_buffer->base); - } - IREE_TRACE_ZONE_END(z0); - return status; -} - -static void iree_hal_xrt_lite_command_buffer_destroy( - iree_hal_command_buffer_t* base_command_buffer) { - iree_hal_xrt_lite_command_buffer* command_buffer = - iree_hal_xrt_lite_command_buffer_cast(base_command_buffer); - iree_allocator_t host_allocator = command_buffer->host_allocator; - IREE_TRACE_ZONE_BEGIN(z0); - - // TODO(null): release any implementation resources and - // iree_hal_resource_set_t. - iree_hal_buffer_destroy(command_buffer->exec_buffer); - iree_allocator_free(host_allocator, command_buffer); - - IREE_TRACE_ZONE_END(z0); -} - -bool iree_hal_xrt_lite_command_buffer_isa( - iree_hal_command_buffer_t* command_buffer) { - return iree_hal_resource_is(&command_buffer->resource, - &iree_hal_xrt_lite_command_buffer_vtable); -} - -#define COMMAND_BUFFER_MEMBER(member, return_t) \ - MEMBER_WRAPPER(iree_hal_command_buffer_t, iree_hal_xrt_lite_command_buffer, \ - member, return_t) -#define COMMAND_BUFFER_MEMBER_STATUS(member) \ - MEMBER_WRAPPER_STATUS(iree_hal_command_buffer_t, \ - iree_hal_xrt_lite_command_buffer, member) -#define COMMAND_BUFFER_MEMBER_VOID(member) \ - MEMBER_WRAPPER_VOID(iree_hal_command_buffer_t, \ - iree_hal_xrt_lite_command_buffer, member) - -COMMAND_BUFFER_MEMBER_STATUS(begin); -COMMAND_BUFFER_MEMBER_STATUS(end); -COMMAND_BUFFER_MEMBER_VOID(begin_debug_group); -COMMAND_BUFFER_MEMBER_VOID(end_debug_group); -COMMAND_BUFFER_MEMBER_STATUS(execution_barrier); -COMMAND_BUFFER_MEMBER_STATUS(signal_event); -COMMAND_BUFFER_MEMBER_STATUS(reset_event); -COMMAND_BUFFER_MEMBER_STATUS(wait_events); 
-COMMAND_BUFFER_MEMBER_STATUS(discard_buffer); -COMMAND_BUFFER_MEMBER_STATUS(fill_buffer); -COMMAND_BUFFER_MEMBER_STATUS(update_buffer); -COMMAND_BUFFER_MEMBER_STATUS(copy_buffer); -COMMAND_BUFFER_MEMBER_STATUS(collective); -COMMAND_BUFFER_MEMBER_STATUS(dispatch); -COMMAND_BUFFER_MEMBER_STATUS(dispatch_indirect); - -namespace { -const iree_hal_command_buffer_vtable_t iree_hal_xrt_lite_command_buffer_vtable = - { - .destroy = iree_hal_xrt_lite_command_buffer_destroy, - .begin = iree_hal_xrt_lite_command_buffer_begin, - .end = iree_hal_xrt_lite_command_buffer_end, - .begin_debug_group = iree_hal_xrt_lite_command_buffer_begin_debug_group, - .end_debug_group = iree_hal_xrt_lite_command_buffer_end_debug_group, - .execution_barrier = iree_hal_xrt_lite_command_buffer_execution_barrier, - .signal_event = iree_hal_xrt_lite_command_buffer_signal_event, - .reset_event = iree_hal_xrt_lite_command_buffer_reset_event, - .wait_events = iree_hal_xrt_lite_command_buffer_wait_events, - .discard_buffer = iree_hal_xrt_lite_command_buffer_discard_buffer, - .fill_buffer = iree_hal_xrt_lite_command_buffer_fill_buffer, - .update_buffer = iree_hal_xrt_lite_command_buffer_update_buffer, - .copy_buffer = iree_hal_xrt_lite_command_buffer_copy_buffer, - .collective = iree_hal_xrt_lite_command_buffer_collective, - .dispatch = iree_hal_xrt_lite_command_buffer_dispatch, - .dispatch_indirect = iree_hal_xrt_lite_command_buffer_dispatch_indirect, -}; -} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h deleted file mode 100644 index 7283582bf..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_HAL_DRIVERS_XRT_LITE_COMMAND_BUFFER_H_ -#define IREE_HAL_DRIVERS_XRT_LITE_COMMAND_BUFFER_H_ - -#include "iree/base/api.h" -#include "iree/hal/api.h" - -// Creates {Null} command buffer. -iree_status_t iree_hal_xrt_lite_command_buffer_create( - iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, - iree_hal_command_category_t command_categories, - iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, - iree_allocator_t host_allocator, - iree_hal_command_buffer_t** out_command_buffer); - -// Returns true if |command_buffer| is a {Null} command buffer. -bool iree_hal_xrt_lite_command_buffer_isa( - iree_hal_command_buffer_t* command_buffer); - -#endif // IREE_HAL_DRIVERS_XRT_LITE_COMMAND_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt index 8ed1891b0..a8125ec00 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -52,6 +52,8 @@ iree_bytecode_module( --iree-amd-aie-show-invoked-commands --iree-hal-memoization=false --iree-hal-indirect-command-buffers=false + DEPS + iree-aie-xclbinutil PUBLIC TESTONLY ) @@ -76,6 +78,51 @@ iree_c_embed_data( TESTONLY ) +#iree_bytecode_module( +# NAME +# xrt_lite_command_buffer_dispatch_test_module +# MODULE_FILE_NAME +# xrt_lite_command_buffer_dispatch_test.bin +# SRC +# "${CMAKE_CURRENT_LIST_DIR}/command_buffer_dispatch_test.mlir" +# FLAGS +# --compile-mode=hal-executable +# --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} +# --iree-hal-target-backends=amd-aie +# --iree-amdaie-lower-to-aie-pipeline=air +# --iree-amdaie-target-device=${TARGET_DEVICE} +# --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} +# --iree-amd-aie-vitis-install-dir=${VITIS_DIR} +# --iree-amd-aie-enable-chess=$ +# 
--iree-amd-aie-show-invoked-commands +# --iree-hal-memoization=false +# --iree-hal-indirect-command-buffers=false +# DEPS +# iree-aie-xclbinutil +# PUBLIC +# TESTONLY +#) +# +#iree_c_embed_data( +# NAME +# xrt_lite_command_buffer_dispatch_c +# SRCS +# xrt_lite_command_buffer_dispatch_test.bin +# C_FILE_OUTPUT +# xrt_lite_command_buffer_dispatch_c.c +# H_FILE_OUTPUT +# xrt_lite_command_buffer_dispatch_c.h +# IDENTIFIER +# iree_cts_testdata_command_buffer_dispatch_aie_xrt_lite +# STRIP_PREFIX +# xrt_lite_ +# DEPENDS +# ::xrt_lite_command_buffer_dispatch_test_module +# FLATTEN +# PUBLIC +# TESTONLY +#) + iree_cc_test( NAME xrt_lite_executable_cache_test @@ -92,9 +139,9 @@ iree_cc_test( iree_cc_test( NAME - xrt_lite_command_buffer_dispatch_test + xrt_lite_dispatch_test SRCS - command_buffer_dispatch_test.cc + matmul_dispatch_test.cc DEPS ::xrt_lite_executables_c iree-amd-aie::driver::xrt-lite::registration @@ -106,4 +153,4 @@ iree_cc_test( ) target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_executable_cache_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") -target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_command_buffer_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") +target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir index 4a27d79e0..dedbcab6b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir @@ -9,24 +9,24 @@ flags = Indirect > hal.executable.source public @amdaie_fb { - hal.executable.export public @matmul_f32_dispatch_0_matmul_256x256x32_f32 ordinal(0) layout(#pipeline_layout) { + hal.executable.export public @matmul_f32_dispatch_0_matmul_32x32x32_f32 ordinal(0) layout(#pipeline_layout) { 
^bb0(%arg0: !hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index } builtin.module { - func.func @matmul_f32_dispatch_0_matmul_256x256x32_f32() { + func.func @matmul_f32_dispatch_0_matmul_32x32x32_f32() { %c0_f32 = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x256xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x256xf32> - %5 = tensor.empty() : tensor<256x256xf32> - %6 = linalg.fill ins(%c0_f32 : f32) outs(%5 : tensor<256x256xf32>) -> tensor<256x256xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%6 : tensor<256x256xf32>) -> tensor<256x256xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : 
!flow.dispatch.tensor> -> tensor<32x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> + %5 = tensor.empty() : tensor<32x32xf32> + %6 = linalg.fill ins(%c0_f32 : f32) outs(%5 : tensor<32x32xf32>) -> tensor<32x32xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%6 : tensor<32x32xf32>) -> tensor<32x32xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xf32> -> !flow.dispatch.tensor> return } } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/command_buffer_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc similarity index 94% rename from runtime/src/iree-amd-aie/driver/xrt-lite/cts/command_buffer_dispatch_test.cc rename to runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc index 00053e145..f00bfbddc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/command_buffer_dispatch_test.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc @@ -23,7 +23,7 @@ iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { return iree_hal_xrt_lite_driver_module_register(registry); } -const char* get_test_executable_format() { return "amdaie-pdi-fb"; } +const char* get_test_executable_format() { return "amdaie-xclbin-fb"; } iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { const struct iree_file_toc_t* toc = @@ -32,7 +32,7 @@ iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { return iree_make_const_byte_span(file.data, file.size); } -class CommandBufferDispatchTest +class MatMulDispatchTest : public CTSTestBase<::testing::TestWithParam> { protected: void PrepareMatmulExecutable() { @@ -75,7 +75,7 @@ int32_t generate_random_number(iree_hal_element_type_t element_type, min; } -TEST_F(CommandBufferDispatchTest, Create) { 
+TEST_F(MatMulDispatchTest, Create) { iree_hal_command_buffer_t* command_buffer = nullptr; IREE_ASSERT_OK(iree_hal_command_buffer_create( device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, @@ -89,7 +89,7 @@ TEST_F(CommandBufferDispatchTest, Create) { iree_hal_command_buffer_release(command_buffer); } -TEST_F(CommandBufferDispatchTest, BeginEnd) { +TEST_F(MatMulDispatchTest, BeginEnd) { iree_hal_command_buffer_t* command_buffer = nullptr; IREE_ASSERT_OK(iree_hal_command_buffer_create( device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, @@ -102,7 +102,7 @@ TEST_F(CommandBufferDispatchTest, BeginEnd) { iree_hal_command_buffer_release(command_buffer); } -TEST_F(CommandBufferDispatchTest, SubmitEmpty) { +TEST_F(MatMulDispatchTest, SubmitEmpty) { iree_hal_command_buffer_t* command_buffer = nullptr; IREE_ASSERT_OK(iree_hal_command_buffer_create( device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, @@ -117,11 +117,11 @@ TEST_F(CommandBufferDispatchTest, SubmitEmpty) { iree_hal_command_buffer_release(command_buffer); } -TEST_P(CommandBufferDispatchTest, DispatchMatmul) { +TEST_P(MatMulDispatchTest, DispatchMatmul) { PrepareMatmulExecutable(); // Create input buffer. 
- constexpr iree_device_size_t WIDTH = 256; + constexpr iree_device_size_t WIDTH = 32; constexpr iree_device_size_t M = WIDTH, K = WIDTH, N = WIDTH; iree_hal_buffer_t *input_A = nullptr, *input_B = nullptr, *output_C = nullptr; int32_t seed = @@ -133,7 +133,7 @@ TEST_P(CommandBufferDispatchTest, DispatchMatmul) { iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed + 1); CreateFilledDeviceBuffer(M * K * sizeof(float), a, &input_A); CreateFilledDeviceBuffer(K * N * sizeof(float), b, &input_B); - CreateFilledDeviceBuffer(M * N * sizeof(float), 0, &output_C); + CreateFilledDeviceBuffer(M * N * sizeof(float), -1, &output_C); iree_hal_buffer_ref_t binding_refs[3]; iree_hal_buffer_binding_table_t binding_table = @@ -217,7 +217,7 @@ TEST_P(CommandBufferDispatchTest, DispatchMatmul) { CleanupExecutable(); } -INSTANTIATE_TEST_SUITE_P(CommandBufferDispatchTest, CommandBufferDispatchTest, +INSTANTIATE_TEST_SUITE_P(MatMulDispatchTest, MatMulDispatchTest, ::testing::Values(RecordingType::kDirect), GenerateTestName()); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index ab3dbca70..8ee3a3085 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -4,14 +4,18 @@ // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "iree-amd-aie/driver/xrt-lite/device.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" #include "iree-amd-aie/driver/xrt-lite/allocator.h" #include "iree-amd-aie/driver/xrt-lite/api.h" -#include "iree-amd-aie/driver/xrt-lite/command_buffer.h" -#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/direct_command_buffer.h" +#include "iree-amd-aie/driver/xrt-lite/nop_executable_cache.h" +#include "iree-amd-aie/driver/xrt-lite/nop_semaphore.h" #include "iree-amd-aie/driver/xrt-lite/util.h" -#include "nop_executable_cache.h" +#include "iree/hal/utils/deferred_command_buffer.h" +#include "iree/hal/utils/deferred_work_queue.h" + +#define ARENA_BLOCK_SIZE (32 * 1024) struct iree_hal_xrt_lite_device { iree_hal_resource_t resource; @@ -19,6 +23,9 @@ struct iree_hal_xrt_lite_device { iree_allocator_t host_allocator; // not used iree_hal_allocator_t* device_allocator; + // Block pool used for command buffers with a larger block size (as command + // buffers can contain inlined data uploads). + iree_arena_block_pool_t block_pool; std::shared_ptr shim_device; iree_status_t create_executable_cache( @@ -36,9 +43,89 @@ struct iree_hal_xrt_lite_device { iree_hal_command_buffer_t** out_command_buffer) { // TODO(null): pass any additional resources required to create the command // buffer. The implementation could pool command buffers here. 
- return iree_hal_xrt_lite_command_buffer_create( - device_allocator, mode, command_categories, queue_affinity, - binding_capacity, host_allocator, out_command_buffer); + if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "unimplmented multi-shot command buffer"); + } + return iree_hal_deferred_command_buffer_create( + device_allocator, mode, command_categories, binding_capacity, + &block_pool, host_allocator, out_command_buffer); + } + + iree_status_t create_semaphore(uint64_t initial_value, + iree_hal_semaphore_flags_t flags, + iree_hal_semaphore_t** out_semaphore) { + return iree_hal_xrt_lite_semaphore_create(host_allocator, initial_value, + out_semaphore); + } + + iree_status_t queue_execute( + iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_host_size_t command_buffer_count, + iree_hal_command_buffer_t* const* command_buffers, + iree_hal_buffer_binding_table_t const* binding_tables) { + IREE_TRACE_ZONE_BEGIN(z0); + + for (iree_host_size_t i = 0; i < command_buffer_count; i++) { + iree_hal_command_buffer_t* xrt_command_buffer = nullptr; + iree_hal_command_buffer_mode_t mode = + IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT | + IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION | + IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_direct_command_buffer_create( + shim_device, device_allocator, mode, + IREE_HAL_COMMAND_CATEGORY_ANY, + /*binding_capacity=*/0, &block_pool, host_allocator, + &xrt_command_buffer)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_deferred_command_buffer_apply( + command_buffers[i], xrt_command_buffer, + iree_hal_buffer_binding_table_empty())); + } + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); + } + + void replace_device_allocator(iree_hal_allocator_t* new_allocator) { + 
iree_hal_allocator_retain(new_allocator); + iree_hal_allocator_release(this->device_allocator); + this->device_allocator = new_allocator; + } + + iree_status_t query_i64(iree_string_view_t category, iree_string_view_t key, + int64_t* out_value) { + *out_value = 0; + if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) { + *out_value = + iree_string_view_match_pattern(this->identifier, key) ? 1 : 0; + return iree_ok_status(); + } + + if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) { + *out_value = + iree_string_view_equal(key, IREE_SV("amdaie-xclbin-fb")) ? 1 : 0; + return iree_ok_status(); + } + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported query"); + } + + iree_status_t queue_alloca( + iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + // TODO: queue-ordered allocations. + IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, + iree_infinite_timeout())); + IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( + device_allocator, params, allocation_size, out_buffer)); + IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list)); + return iree_ok_status(); } }; @@ -82,6 +169,8 @@ iree_status_t iree_hal_xrt_lite_device_create( // from the same driver. 
iree_status_t status = iree_hal_xrt_lite_allocator_create( host_allocator, device->shim_device, &device->device_allocator); + iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, + &device->block_pool); // TODO(max): device id *out_device = reinterpret_cast(device); if (iree_status_is_ok(status)) { @@ -143,6 +232,11 @@ static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( DEVICE_MEMBER_STATUS(create_executable_cache); DEVICE_MEMBER_STATUS(create_command_buffer); +DEVICE_MEMBER_STATUS(create_semaphore); +DEVICE_MEMBER_STATUS(queue_execute); +DEVICE_MEMBER_STATUS(query_i64); +DEVICE_MEMBER_STATUS(queue_alloca); +DEVICE_MEMBER_VOID(replace_device_allocator); namespace { const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { @@ -150,6 +244,13 @@ const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { .id = iree_hal_xrt_lite_device_id, .host_allocator = iree_hal_xrt_lite_device_host_allocator, .device_allocator = iree_hal_xrt_lite_device_device_allocator, + .replace_device_allocator = + iree_hal_xrt_lite_device_replace_device_allocator, + .query_i64 = iree_hal_xrt_lite_device_query_i64, + .create_command_buffer = iree_hal_xrt_lite_device_create_command_buffer, .create_executable_cache = iree_hal_xrt_lite_device_create_executable_cache, - .create_command_buffer = iree_hal_xrt_lite_device_create_command_buffer}; + .create_semaphore = iree_hal_xrt_lite_device_create_semaphore, + .queue_alloca = iree_hal_xrt_lite_device_queue_alloca, + .queue_execute = iree_hal_xrt_lite_device_queue_execute, +}; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h deleted file mode 100644 index c8d2a6e1f..000000000 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_DEVICE_H_ -#define IREE_AMD_AIE_DRIVER_XRT_LITE_DEVICE_H_ - -#include "iree/base/api.h" -#include "iree/hal/api.h" - -// NOTE: nothing in the skeleton implementation. Device creation and adoption is -// part of the public API header. This header can contain internal types and -// functions. - -#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_DEVICE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc new file mode 100644 index 000000000..7c856b88e --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -0,0 +1,371 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/direct_command_buffer.h" + +#include "iree-amd-aie/driver/xrt-lite/buffer.h" +#include "iree-amd-aie/driver/xrt-lite/executable.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h" +#include "iree/hal/utils/resource_set.h" + +// The max number of bindings per descriptor set allowed in the XRT HAL +// implementation. +#define IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT 16 + +// The max number of descriptor sets allowed in the XRT HAL implementation. +// This depends on the general descriptor set planning in IREE and should adjust +// with it. +#define IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_COUNT 4 + +struct iree_hal_xrt_lite_direct_command_buffer { + iree_hal_command_buffer_t base; + iree_allocator_t host_allocator; + // A resource set to maintain references to all resources used within the + // command buffer. Reset on each begin. + iree_hal_resource_set_t* resource_set; + // Staging arena used for host->device transfers. 
+ iree_arena_allocator_t arena; + + std::shared_ptr shim_device; + + struct { + shim_xdna::bo* bindings[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT]; + // Offset and length are used to get the sub buffer at kernel launch. + iree_device_size_t + offsets[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT]; + iree_device_size_t + lengths[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT]; + + } descriptor_sets[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_COUNT]; +}; + +namespace { +extern const iree_hal_command_buffer_vtable_t + iree_hal_xrt_lite_direct_command_buffer_vtable; +} // namespace + +static iree_hal_xrt_lite_direct_command_buffer* +iree_hal_xrt_lite_direct_command_buffer_cast( + iree_hal_command_buffer_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, + &iree_hal_xrt_lite_direct_command_buffer_vtable); + return (iree_hal_xrt_lite_direct_command_buffer*)base_value; +} + +iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( + std::shared_ptr shim_device, + iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, + iree_allocator_t host_allocator, + iree_hal_command_buffer_t** out_command_buffer) { + IREE_ASSERT_ARGUMENT(device_allocator); + IREE_ASSERT_ARGUMENT(out_command_buffer); + *out_command_buffer = nullptr; + if (binding_capacity > 0) { + // TODO(#10144): support indirect command buffers with binding tables. 
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "indirect command buffers not yet implemented"); + } + + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_direct_command_buffer* command_buffer = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, + iree_allocator_malloc(host_allocator, + sizeof(*command_buffer) + + iree_hal_command_buffer_validation_state_size( + mode, binding_capacity), + (void**)&command_buffer)); + iree_hal_command_buffer_initialize( + device_allocator, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY, + binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer), + &iree_hal_xrt_lite_direct_command_buffer_vtable, &command_buffer->base); + command_buffer->host_allocator = host_allocator; + command_buffer->shim_device = shim_device; + iree_arena_initialize(block_pool, &command_buffer->arena); + iree_status_t status = + iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set); + if (iree_status_is_ok(status)) { + *out_command_buffer = &command_buffer->base; + } else { + iree_hal_command_buffer_release(&command_buffer->base); + } + + IREE_TRACE_ZONE_END(z0); + + return status; +} +static void iree_hal_xrt_lite_direct_command_buffer_destroy( + iree_hal_command_buffer_t* base_command_buffer) { + iree_hal_xrt_lite_direct_command_buffer* command_buffer = + iree_hal_xrt_lite_direct_command_buffer_cast(base_command_buffer); + iree_allocator_t host_allocator = command_buffer->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + command_buffer->shim_device.reset(); + iree_hal_resource_set_free(command_buffer->resource_set); + iree_arena_deinitialize(&command_buffer->arena); + iree_allocator_free(host_allocator, command_buffer); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_begin( + iree_hal_command_buffer_t* base_command_buffer) { + // Nothing to do. 
+ return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_end( + iree_hal_command_buffer_t* base_command_buffer) { + iree_hal_xrt_lite_direct_command_buffer* command_buffer = + iree_hal_xrt_lite_direct_command_buffer_cast(base_command_buffer); + IREE_TRACE_ZONE_BEGIN(z0); + iree_arena_reset(&command_buffer->arena); + iree_hal_resource_set_free(command_buffer->resource_set); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_resource_set_allocate(command_buffer->arena.block_pool, + &command_buffer->resource_set)); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_direct_command_buffer_begin_debug_group( + iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label, + iree_hal_label_color_t label_color, + const iree_hal_label_location_t* location) { + (void)iree_status_from_code(IREE_STATUS_UNIMPLEMENTED); +} + +static void iree_hal_xrt_lite_direct_command_buffer_end_debug_group( + iree_hal_command_buffer_t* base_command_buffer) { + (void)iree_status_from_code(IREE_STATUS_UNIMPLEMENTED); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_execution_barrier( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_execution_stage_t source_stage_mask, + iree_hal_execution_stage_t target_stage_mask, + iree_hal_execution_barrier_flags_t flags, + iree_host_size_t memory_barrier_count, + const iree_hal_memory_barrier_t* memory_barriers, + iree_host_size_t buffer_barrier_count, + const iree_hal_buffer_barrier_t* buffer_barriers) { + if (iree_any_bit_set(source_stage_mask, IREE_HAL_EXECUTION_STAGE_HOST) || + iree_any_bit_set(target_stage_mask, IREE_HAL_EXECUTION_STAGE_HOST)) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "barrier involving host not yet supported"); + } + + if (flags != IREE_HAL_EXECUTION_BARRIER_FLAG_NONE) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "non-zero barrier flag not yet supported"); + } + + // Nothing to do in 
current synchronous mode. + + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_signal_event( + iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event, + iree_hal_execution_stage_t source_stage_mask) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported"); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_reset_event( + iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event, + iree_hal_execution_stage_t source_stage_mask) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported"); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_wait_events( + iree_hal_command_buffer_t* base_command_buffer, + iree_host_size_t event_count, const iree_hal_event_t** events, + iree_hal_execution_stage_t source_stage_mask, + iree_hal_execution_stage_t target_stage_mask, + iree_host_size_t memory_barrier_count, + const iree_hal_memory_barrier_t* memory_barriers, + iree_host_size_t buffer_barrier_count, + const iree_hal_buffer_barrier_t* buffer_barriers) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported"); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_discard_buffer( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_buffer_ref_t buffer) { + // It is okay to do nothing here. 
+ return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_fill_buffer( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_buffer_ref_t target_ref, const void* pattern, + iree_host_size_t pattern_length) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "fill buffer not yet supported"); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_update_buffer( + iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer, + iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) { + IREE_TRACE_ZONE_BEGIN(z0); + const uint8_t* src = (const uint8_t*)source_buffer + source_offset; + + // No need to Allocate scratch space (in an arena) as the memcpy + // used below is expected to be synchronized. + shim_xdna::bo* target_device_buffer = iree_hal_xrt_lite_buffer_handle( + iree_hal_buffer_allocated_buffer(target_ref.buffer)); + void* target_device_buffer_ptr = target_device_buffer->map(); + uint8_t* dst = (uint8_t*)target_device_buffer_ptr + + iree_hal_buffer_byte_offset(target_ref.buffer) + + target_ref.offset; + memcpy(dst, src, target_ref.length); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_copy_buffer( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) { + IREE_TRACE_ZONE_BEGIN(z0); + + shim_xdna::bo* target_device_buffer = iree_hal_xrt_lite_buffer_handle( + iree_hal_buffer_allocated_buffer(target_ref.buffer)); + void* target_device_buffer_ptr = target_device_buffer->map(); + iree_device_size_t target_offset = + iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset; + + shim_xdna::bo* source_device_buffer = iree_hal_xrt_lite_buffer_handle( + iree_hal_buffer_allocated_buffer(source_ref.buffer)); + void* source_device_buffer_ptr = source_device_buffer->map(); + iree_device_size_t source_offset = + 
iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset; + + uint8_t* dst = (uint8_t*)target_device_buffer_ptr + target_offset; + uint8_t* src = (uint8_t*)source_device_buffer_ptr + source_offset; + memcpy(dst, src, target_ref.length); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_collective( + iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel, + iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref, + iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "collectives not yet supported"); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_executable_t* executable, int32_t entry_point, + const uint32_t workgroup_count[3], iree_const_byte_span_t constants, + iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) { + iree_hal_xrt_lite_direct_command_buffer* command_buffer = + reinterpret_cast( + base_command_buffer); + IREE_TRACE_ZONE_BEGIN(z0); + + // Lookup kernel parameters used for side-channeling additional launch + // information from the compiler. 
+ iree_hal_xrt_lite_kernel_params_t kernel_params; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_native_executable_entry_point_kernel_params( + executable, entry_point, &kernel_params)); + + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, + &executable)); + + xrt::xclbin xclbin = xrt::xclbin(kernel_params.xclbinVector); + kernel_params.context = + command_buffer->shim_device->create_hw_context(xclbin); + uint32_t num_instr = flatbuffers_uint32_vec_len(kernel_params.asm_inst); + size_t ctrl_code_size = num_instr * sizeof(uint32_t); + auto bo_ctrl_code = command_buffer->shim_device->alloc_bo( + ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); + uint32_t* instr_buffer = static_cast(bo_ctrl_code->map()); + memcpy(instr_buffer, kernel_params.asm_inst, ctrl_code_size); + bo_ctrl_code->sync(shim_xdna::direction::host2device); + + std::string cu_name = kernel_params.kernel_name; + cu_name += ":IREE"; + shim_xdna::cuidx_t cu_idx = kernel_params.context->open_cu_context(cu_name); + + shim_xdna::exec_buf ebuf(command_buffer->shim_device->get_pdev(), + ERT_START_CU); + ebuf.set_cu_idx(cu_idx); + unsigned int opcode = 3; + ebuf.add_arg_64(opcode); + ebuf.add_arg_bo(*bo_ctrl_code); + ebuf.add_arg_32(num_instr); + for (iree_host_size_t j = 0; j < bindings.count; ++j) { + shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( + iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); + ebuf.add_arg_bo(*bo); + } + + for (iree_host_size_t j = 0; j < bindings.count; ++j) { + shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( + iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); + bo->sync(shim_xdna::direction::host2device); + } + shim_xdna::hw_q* hwq = kernel_params.context->get_hw_queue(); + hwq->issue_command(ebuf.get_exec_buf_bo()); + hwq->wait_command(ebuf.get_exec_buf_bo(), 0); + + for (iree_host_size_t j = 0; j < bindings.count; ++j) { + shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( + 
iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); + bo->sync(shim_xdna::direction::device2host); + } + + for (iree_host_size_t j = 0; j < bindings.count; ++j) { + shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( + iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); + } + + IREE_TRACE_ZONE_END(z0); + + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch_indirect( + iree_hal_command_buffer_t* base_command_buffer, + iree_hal_executable_t* executable, int32_t entry_point, + iree_hal_buffer_ref_t workgroups_ref, iree_const_byte_span_t constants, + iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "need xrt implementation of dispatch indirect"); +} + +namespace { +const iree_hal_command_buffer_vtable_t + iree_hal_xrt_lite_direct_command_buffer_vtable = { + .destroy = iree_hal_xrt_lite_direct_command_buffer_destroy, + .begin = iree_hal_xrt_lite_direct_command_buffer_begin, + .end = iree_hal_xrt_lite_direct_command_buffer_end, + .execution_barrier = + iree_hal_xrt_lite_direct_command_buffer_execution_barrier, + .signal_event = iree_hal_xrt_lite_direct_command_buffer_signal_event, + .reset_event = iree_hal_xrt_lite_direct_command_buffer_reset_event, + .wait_events = iree_hal_xrt_lite_direct_command_buffer_wait_events, + .discard_buffer = + iree_hal_xrt_lite_direct_command_buffer_discard_buffer, + .fill_buffer = iree_hal_xrt_lite_direct_command_buffer_fill_buffer, + .update_buffer = iree_hal_xrt_lite_direct_command_buffer_update_buffer, + .copy_buffer = iree_hal_xrt_lite_direct_command_buffer_copy_buffer, + .collective = iree_hal_xrt_lite_direct_command_buffer_collective, + .dispatch = iree_hal_xrt_lite_direct_command_buffer_dispatch, + .dispatch_indirect = + iree_hal_xrt_lite_direct_command_buffer_dispatch_indirect, +}; +} // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h 
b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h new file mode 100644 index 000000000..91eb4aece --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h @@ -0,0 +1,32 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree/base/internal/arena.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// |out_command_buffer| must be released by the caller (see +// iree_hal_command_buffer_release). +iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( + std::shared_ptr shim_device, + iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, + iree_allocator_t host_allocator, + iree_hal_command_buffer_t** out_command_buffer); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index fdb48b0d8..2e9c46ec1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -161,7 +161,9 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( executable->entry_point_count = entry_point_count; for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; entry_ordinal++) { - const char* entry_name = + iree_hal_xrt_lite_kernel_params_t* params = + 
&executable->entry_points[entry_ordinal]; + params->kernel_name = flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); uint32_t xclbin_index = flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal); @@ -170,38 +172,33 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( flatbuffers_string_t xclbin_fb = iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def); - iree_hal_xrt_lite_kernel_params_t* params = - &executable->entry_points[entry_ordinal]; - // XRT API needs this vector and cant actually read a void*. std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); - xrt::xclbin xclbin = xrt::xclbin(xclbinVector); - params->context = shim_device->create_hw_context(xclbin); + params->xclbinVector = xclbinVector; +// xrt::xclbin xclbin = xrt::xclbin(xclbinVector); +// params->context = shim_device->create_hw_context(xclbin); uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def = iree_amd_aie_hal_xrt_AsmInstDef_vec_at(asm_instrs_vec, asm_instr_index); - flatbuffers_uint32_vec_t asm_inst = + params->asm_inst = iree_amd_aie_hal_xrt_AsmInstDef_asm_inst_get(asminst_def); - uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); - size_t ctrl_code_size = num_instr * sizeof(uint32_t); - params->bo_ctrl_code = - shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); - uint32_t* instr_buffer = - static_cast(params->bo_ctrl_code->map()); - memcpy(instr_buffer, asm_inst, ctrl_code_size); - params->num_instr = num_instr; +// uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); +// size_t ctrl_code_size = num_instr * sizeof(uint32_t); +// params->bo_ctrl_code = +// shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); +// uint32_t* instr_buffer = +// static_cast(params->bo_ctrl_code->map()); +// memcpy(instr_buffer, asm_inst, ctrl_code_size); // Stash the entry point name in the string table for use when tracing. 
IREE_TRACE({ - iree_host_size_t entry_name_length = flatbuffers_string_len(entry_name); - memcpy(string_table_buffer, entry_name, entry_name_length); - params->kernel_name = - iree_make_string_view(string_table_buffer, entry_name_length); - string_table_buffer += entry_name_length; + memcpy(string_table_buffer, params->kernel_name.data(), + params->kernel_name.size()); + string_table_buffer += params->kernel_name.size(); }); IREE_TRACE({ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index b70c266cc..ee57055e4 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -9,6 +9,7 @@ #include +#include "flatbuffers_common_reader.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h" @@ -24,9 +25,10 @@ extern "C" { struct iree_hal_xrt_lite_kernel_params_t { std::unique_ptr context; std::unique_ptr bo_ctrl_code; + std::vector xclbinVector; + flatbuffers_uint32_vec_t asm_inst; // Number of assembly instructions argument to the kernel - uint32_t num_instr; // number of instructions - IREE_TRACE(iree_string_view_t kernel_name;) + std::string kernel_name; IREE_TRACE(iree_string_view_t source_filename;) IREE_TRACE(uint32_t source_line;) }; @@ -44,7 +46,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( iree_hal_xrt_lite_kernel_params_t* out_params); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc index 2753eebb7..8a617f977 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc +++ 
b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -62,6 +62,7 @@ static void iree_hal_xrt_lite_nop_executable_cache_destroy( iree_hal_xrt_lite_nop_executable_cache_cast(base_executable_cache); IREE_TRACE_ZONE_BEGIN(z0); + executable_cache->shim_device.reset(); iree_allocator_free(executable_cache->host_allocator, executable_cache); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc new file mode 100644 index 000000000..17810350f --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc @@ -0,0 +1,115 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/nop_semaphore.h" + +#include + +#include "iree/base/api.h" +#include "iree/hal/utils/semaphore_base.h" + +struct iree_hal_xrt_lite_semaphore_t { + iree_hal_semaphore_t base; + iree_atomic_int64_t value; + iree_allocator_t host_allocator; +}; + +namespace { +extern const iree_hal_semaphore_vtable_t iree_hal_xrt_lite_semaphore_vtable; +} // namespace + +static iree_hal_xrt_lite_semaphore_t* iree_hal_xrt_lite_semaphore_cast( + iree_hal_semaphore_t* base_value) { + IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_semaphore_vtable); + return (iree_hal_xrt_lite_semaphore_t*)base_value; +} + +iree_status_t iree_hal_xrt_lite_semaphore_create( + iree_allocator_t host_allocator, uint64_t initial_value, + iree_hal_semaphore_t** out_semaphore) { + IREE_ASSERT_ARGUMENT(out_semaphore); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_semaphore_t* semaphore = nullptr; + iree_status_t status = iree_allocator_malloc( + host_allocator, sizeof(*semaphore), (void**)&semaphore); + if (iree_status_is_ok(status)) { + 
iree_hal_semaphore_initialize(&iree_hal_xrt_lite_semaphore_vtable, + &semaphore->base); + semaphore->host_allocator = host_allocator; + iree_atomic_store_int64(&semaphore->value, initial_value, + iree_memory_order_release); + *out_semaphore = &semaphore->base; + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_xrt_lite_semaphore_destroy( + iree_hal_semaphore_t* base_semaphore) { + iree_hal_xrt_lite_semaphore_t* semaphore = + iree_hal_xrt_lite_semaphore_cast(base_semaphore); + iree_allocator_t host_allocator = semaphore->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_semaphore_deinitialize(&semaphore->base); + iree_allocator_free(host_allocator, semaphore); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_xrt_lite_semaphore_query( + iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) { + iree_hal_xrt_lite_semaphore_t* semaphore = + iree_hal_xrt_lite_semaphore_cast(base_semaphore); + // TODO: Support semaphores completely. + *out_value = + iree_atomic_load_int64(&semaphore->value, iree_memory_order_acquire); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_semaphore_signal( + iree_hal_semaphore_t* base_semaphore, uint64_t new_value) { + iree_hal_xrt_lite_semaphore_t* semaphore = + iree_hal_xrt_lite_semaphore_cast(base_semaphore); + // TODO: Support semaphores completely. Return OK currently as everything is + // synchronized for each submit to allow things to run. + iree_atomic_store_int64(&semaphore->value, new_value, + iree_memory_order_release); + iree_hal_semaphore_poll(&semaphore->base); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_semaphore_fail( + iree_hal_semaphore_t* base_semaphore, iree_status_t status) { + iree_hal_xrt_lite_semaphore_t* semaphore = + iree_hal_xrt_lite_semaphore_cast(base_semaphore); + // TODO: save status and mark timepoint as failed. 
+ iree_status_ignore(status); + iree_hal_semaphore_poll(&semaphore->base); +} + +static iree_status_t iree_hal_xrt_lite_semaphore_wait( + iree_hal_semaphore_t* base_semaphore, uint64_t value, + iree_timeout_t timeout) { + iree_hal_xrt_lite_semaphore_t* semaphore = + iree_hal_xrt_lite_semaphore_cast(base_semaphore); + // TODO: Support semaphores completely. Return OK currently as everything is + // synchronized for each submit to allow things to run. + iree_hal_semaphore_poll(&semaphore->base); + return iree_ok_status(); +} + +namespace { +const iree_hal_semaphore_vtable_t iree_hal_xrt_lite_semaphore_vtable = { + /*.destroy = */ iree_hal_xrt_lite_semaphore_destroy, + /*.query = */ iree_hal_xrt_lite_semaphore_query, + /*.signal = */ iree_hal_xrt_lite_semaphore_signal, + /*.fail = */ iree_hal_xrt_lite_semaphore_fail, + /*.wait = */ iree_hal_xrt_lite_semaphore_wait, +}; +} // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h new file mode 100644 index 000000000..0a8623863 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h @@ -0,0 +1,27 @@ +// Copyright 2021 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ + +#include + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +iree_status_t iree_hal_xrt_lite_semaphore_create( + iree_allocator_t host_allocator, uint64_t initial_value, + iree_hal_semaphore_t** out_semaphore); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index cc349197c..4cb322881 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -270,6 +270,12 @@ bo::bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags) shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); } +bo::bo(const pdev &p, uint32_t ctx_id, size_t size, uint32_t flags) + : bo(p, ctx_id, size, shim_xcl_bo_flags{.flags = flags}) { + if (m_type == AMDXDNA_BO_INVALID) + shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); +} + bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, amdxdna_bo_type type) : m_pdev(pdev), @@ -430,6 +436,8 @@ void bo::sync(direction dir, size_t size, size_t offset) { } } +void bo::sync(direction dir) { sync(dir, size(), 0); } + void bo::bind_at(size_t pos, const bo &boh, size_t offset, size_t size) { std::lock_guard lg(m_args_map_lock); @@ -471,10 +479,12 @@ uint32_t bo::get_arg_bo_handles(uint32_t *handles, size_t num) const { return sz; } -exec_buf::exec_buf(bo &bo_execbuf, uint32_t op) - : m_exec_buf_bo(bo_execbuf), - m_cmd_pkt(reinterpret_cast(bo_execbuf.map())), - m_cmd_size(bo_execbuf.size()), +exec_buf::exec_buf(const pdev &p, uint32_t op) + : m_exec_buf_bo(std::make_unique(p, 
AMDXDNA_INVALID_CTX_HANDLE, + MAX_EXEC_BO_SIZE, + XCL_BO_FLAGS_EXECBUF)), + m_cmd_pkt(reinterpret_cast(m_exec_buf_bo->map())), + m_cmd_size(m_exec_buf_bo->size()), m_op(op), m_arg_cnt(0), m_reg_idx(0) { @@ -498,7 +508,7 @@ void exec_buf::set_cu_idx(cuidx_t cu_idx) { void exec_buf::add_ctrl_bo(bo &bo_ctrl) { ert_start_kernel_cmd *cmd_packet = - reinterpret_cast(m_exec_buf_bo.map()); + reinterpret_cast(m_exec_buf_bo->map()); switch (m_op) { case ERT_START_CU: break; @@ -541,7 +551,7 @@ void exec_buf::add_arg_64(uint64_t val) { void exec_buf::add_arg_bo(bo &bo_arg, std::string arg_name) { // Add to argument list for driver - m_exec_buf_bo.bind_at(m_arg_cnt, bo_arg, 0, bo_arg.size()); + m_exec_buf_bo->bind_at(m_arg_cnt, bo_arg, 0, bo_arg.size()); // Add to argument list for control code patching if (arg_name.empty()) m_patching_args.emplace_back(std::to_string(m_arg_cnt), bo_arg.get_paddr()); @@ -553,7 +563,7 @@ void exec_buf::add_arg_bo(bo &bo_arg, std::string arg_name) { void exec_buf::dump() { std::cout << "Dumping exec buf:"; - int *data = static_cast(m_exec_buf_bo.map()); + int *data = static_cast(m_exec_buf_bo->map()); std::cout << std::hex; for (int i = 0; i < m_cmd_pkt->count + 1; i++) { if (i % 4 == 0) std::cout << "\n"; @@ -574,4 +584,7 @@ void exec_buf::inc_pkt_count(uint32_t n) { throw std::runtime_error("Size of exec buf too small: " + std::to_string(m_cmd_size)); } + +bo *exec_buf::get_exec_buf_bo() { return m_exec_buf_bo.get(); } + } // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h index 617e9335a..16d01fe8c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -13,6 +13,8 @@ namespace shim_xdna { +#define MAX_EXEC_BO_SIZE 4096 + enum xclBOSyncDirection { XCL_BO_SYNC_BO_TO_DEVICE = 0, XCL_BO_SYNC_BO_FROM_DEVICE, @@ -68,6 +70,7 @@ struct bo { bo(const pdev 
&p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, amdxdna_bo_type type); bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags); + bo(const pdev &p, uint32_t ctx_id, size_t size, uint32_t flags); bo(const pdev &p, int ehdl); // Support BO creation from internal bo(const pdev &p, size_t size, amdxdna_bo_type type); @@ -76,6 +79,7 @@ struct bo { void *map() const; void unmap(void *addr); void sync(direction, size_t size, size_t offset); + void sync(direction); properties get_properties() const; size_t size(); @@ -104,7 +108,7 @@ struct bo { }; struct exec_buf { - bo &m_exec_buf_bo; + std::unique_ptr m_exec_buf_bo; ert_start_kernel_cmd *m_cmd_pkt; size_t m_cmd_size; uint32_t m_op; @@ -112,16 +116,17 @@ struct exec_buf { uint32_t m_reg_idx; std::vector > m_patching_args; - exec_buf(bo &bo_execbuf, uint32_t op); + exec_buf(const pdev &p, uint32_t op); + static void set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx); void set_cu_idx(cuidx_t cu_idx); + bo* get_exec_buf_bo(); + void add_ctrl_bo(bo &bo_ctrl); void add_arg_32(uint32_t val); void add_arg_64(uint64_t val); void add_arg_bo(bo &bo_arg, std::string arg_name = ""); void dump(); - static size_t get_ctrl_code_size(const std::string &elf_path); - void patch_ctrl_code(bo &bo_ctrl, const std::string &elf_path); void inc_pkt_count(uint32_t n); }; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 0e521e4f2..3b1ddb73a 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -169,9 +169,8 @@ std::unique_ptr device::alloc_bo(size_t size, shim_xcl_bo_flags flags) { } std::unique_ptr device::alloc_bo(size_t size, uint32_t flags) { - shim_xcl_bo_flags f{}; - f.flags = flags; - return alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, f); + return alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, + shim_xcl_bo_flags{.flags 
= flags}); } std::unique_ptr device::import_bo(pid_t pid, int ehdl) { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index 2deefa14b..574fc8a20 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -85,6 +85,7 @@ hw_ctx::~hw_ctx() { cuidx_t hw_ctx::open_cu_context(const std::string &cu_name) { for (uint32_t i = 0; i < m_cu_info.size(); i++) { auto &ci = m_cu_info[i]; + shim_debug("ci.m_name %s\n", ci.m_name.c_str()); if (ci.m_name == cu_name) return cuidx_t{.index = i}; } diff --git a/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt index 81f90689b..9d9cabd44 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt @@ -38,6 +38,7 @@ iree_cc_library( "native_executable.h" "native_executable.cc" "nop_semaphore.cc" + "nop_semaphore.h" "nop_executable_cache.h" "nop_executable_cache.cc" DEPS @@ -48,6 +49,7 @@ iree_cc_library( iree::base::internal::flatcc::parsing iree::hal::utils::deferred_command_buffer iree::hal::utils::file_transfer + iree::hal::utils::semaphore_base iree::hal iree-amd-aie::schemas::xrt_executable_def_c_fbs # hide the target from all exports so it doesn't need to be installed diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt new file mode 100644 index 000000000..07746787d --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt @@ -0,0 +1,111 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +include(CMakeDependentOption) + +iree_hal_cts_test_suite( + DRIVER_NAME + xrt + DRIVER_REGISTRATION_HDR + "iree-amd-aie/driver/xrt/registration/driver_module.h" + DRIVER_REGISTRATION_FN + "iree_hal_xrt_driver_module_register" + COMPILER_TARGET_BACKEND + "amd-aie" + EXECUTABLE_FORMAT + "\"amdaie-xclbin-fb\"" + DEPS + iree-amd-aie::driver::xrt::registration + INCLUDED_TESTS + "allocator" + "buffer_mapping" + "driver" +) + +set(PEANO_INSTALL_DIR "" CACHE PATH "") +set(VITIS_DIR "" CACHE PATH "") +if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) + message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") +endif() +cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") +set(TARGET_DEVICE "npu1_4col" CACHE STRING "") + +iree_bytecode_module( + NAME + xrt_executable_cache_test_module + MODULE_FILE_NAME + xrt_executable_cache_test.bin + SRC + "${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" + FLAGS + --compile-mode=hal-executable + --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} + --iree-hal-target-backends=amd-aie + --iree-amdaie-lower-to-aie-pipeline=air + --iree-amdaie-target-device=${TARGET_DEVICE} + --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} + --iree-amd-aie-vitis-install-dir=${VITIS_DIR} + --iree-amd-aie-enable-chess=$ + --iree-amd-aie-show-invoked-commands + --iree-hal-memoization=false + --iree-hal-indirect-command-buffers=false + DEPS + iree-aie-xclbinutil + PUBLIC + TESTONLY +) + +iree_c_embed_data( + NAME + xrt_executables_c + SRCS + xrt_executable_cache_test.bin + C_FILE_OUTPUT + xrt_executables_c.c + H_FILE_OUTPUT + xrt_executables_c.h + IDENTIFIER + iree_cts_testdata_executables_aie_xrt + STRIP_PREFIX + xrt_ + DEPENDS + ::xrt_executable_cache_test_module + FLATTEN + PUBLIC + TESTONLY +) + +iree_cc_test( + NAME + xrt_executable_cache_test + SRCS + executable_cache_test.cc + DEPS + ::xrt_executables_c + iree-amd-aie::driver::xrt::registration + 
iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main +) + +iree_cc_test( + NAME + xrt_dispatch_test + SRCS + matmul_dispatch_test.cc + DEPS + ::xrt_executables_c + iree-amd-aie::driver::xrt::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main + iree::tools::testing::e2e::e2e_test_util +) + +target_include_directories(iree-amd-aie_driver_xrt_cts_xrt_executable_cache_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") +target_include_directories(iree-amd-aie_driver_xrt_cts_xrt_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc new file mode 100644 index 000000000..3e9411cf2 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc @@ -0,0 +1,85 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt/registration/driver_module.h" +#include "iree/base/api.h" +#include "iree/base/string_view.h" +#include "iree/hal/api.h" +#include "iree/hal/cts/cts_test_base.h" +#include "iree/testing/gtest.h" +#include "iree/testing/status_matchers.h" +#include "xrt_executables_c.h" + +namespace iree::hal::cts { + +const char* get_test_driver_name() { return "xrt"; } + +iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { + return iree_hal_xrt_driver_module_register(registry); +} + +const char* get_test_executable_format() { return "amdaie-xclbin-fb"; } + +iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { + const struct iree_file_toc_t* toc = + iree_cts_testdata_executables_aie_xrt_create(); + const auto& file = toc[0]; + return iree_make_const_byte_span(file.data, file.size); +} + +class ExecutableCacheTest : public CTSTestBase<> {}; + +TEST_F(ExecutableCacheTest, Create) { + iree_status_t loop_status = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +TEST_F(ExecutableCacheTest, CantPrepareUnknownFormat) { + iree_status_t loop_status = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + EXPECT_FALSE(iree_hal_executable_cache_can_prepare_format( + executable_cache, /*caching_mode=*/0, iree_make_cstring_view("FOO?"))); + + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +TEST_F(ExecutableCacheTest, PrepareExecutable) { + iree_status_t loop_status 
= iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + iree_hal_executable_params_t executable_params; + iree_hal_executable_params_initialize(&executable_params); + executable_params.caching_mode = + IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; + executable_params.executable_format = + iree_make_cstring_view(get_test_executable_format()); + executable_params.executable_data = get_test_executable_data( + iree_make_cstring_view("executable_cache_test.bin")); + + iree_hal_executable_t* executable = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable( + executable_cache, &executable_params, &executable)); + + iree_hal_executable_release(executable); + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +} // namespace iree::hal::cts diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir new file mode 100644 index 000000000..dedbcab6b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir @@ -0,0 +1,33 @@ +// bootstrapped from https://github.com/nod-ai/iree-amd-aie/blob/9c4c167baf89a279888fba8db75907845946077c/tests/samples/matmul_pack_peel_objectfifo_e2e.mlir + +#pipeline_layout = #hal.pipeline.layout< + bindings = [ + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding + ], + flags = Indirect +> +hal.executable.source public @amdaie_fb { + hal.executable.export public @matmul_f32_dispatch_0_matmul_32x32x32_f32 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_f32_dispatch_0_matmul_32x32x32_f32() { + %c0_f32 = arith.constant 
0.0 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> + %5 = tensor.empty() : tensor<32x32xf32> + %6 = linalg.fill ins(%c0_f32 : f32) outs(%5 : tensor<32x32xf32>) -> tensor<32x32xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%6 : tensor<32x32xf32>) -> tensor<32x32xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xf32> -> !flow.dispatch.tensor> + return + } + } +} diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc new file mode 100644 index 000000000..c48ea13f7 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc @@ -0,0 +1,224 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt/registration/driver_module.h" +#include "iree/base/api.h" +#include "iree/base/string_view.h" +#include "iree/hal/api.h" +#include "iree/hal/buffer_view_util.h" +#include "iree/hal/cts/cts_test_base.h" +#include "iree/testing/gtest.h" +#include "iree/testing/status_matchers.h" +#include "tools/testing/e2e/test_utils.h" +#include "xrt_executables_c.h" + +namespace iree::hal::cts { + +const char* get_test_driver_name() { return "xrt"; } + +iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { + return iree_hal_xrt_driver_module_register(registry); +} + +const char* get_test_executable_format() { return "amdaie-xclbin-fb"; } + +iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { + const struct iree_file_toc_t* toc = + iree_cts_testdata_executables_aie_xrt_create(); + const auto& file = toc[0]; + return iree_make_const_byte_span(file.data, file.size); +} + +class MatMulDispatchTest + : public CTSTestBase<::testing::TestWithParam> { + protected: + void PrepareMatmulExecutable() { + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status_), &executable_cache_)); + + iree_hal_executable_params_t executable_params; + iree_hal_executable_params_initialize(&executable_params); + executable_params.caching_mode = + IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; + executable_params.executable_format = + iree_make_cstring_view(get_test_executable_format()); + executable_params.executable_data = get_test_executable_data( + iree_make_cstring_view("xrt_executable_cache_test.bin")); + + IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable( + executable_cache_, &executable_params, &executable_)); + } + + void CleanupExecutable() { + iree_hal_executable_release(executable_); + iree_hal_executable_cache_release(executable_cache_); + 
IREE_ASSERT_OK(loop_status_); + } + + iree_status_t loop_status_ = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache_ = nullptr; + iree_hal_executable_t* executable_ = nullptr; +}; + +int32_t generate_random_number(iree_hal_element_type_t element_type, + int32_t seed) { + int32_t min = 0; + int32_t max = 0; + iree_test_utils_get_min_max_for_element_type(element_type, &min, &max); + uint32_t range = (max - min + 1); + return (int32_t)iree_test_utils_pseudorandom_range( + reinterpret_cast(&seed), range) + + min; +} + +TEST_F(MatMulDispatchTest, Create) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + EXPECT_TRUE((iree_hal_command_buffer_allowed_categories(command_buffer) & + IREE_HAL_COMMAND_CATEGORY_DISPATCH) == + IREE_HAL_COMMAND_CATEGORY_DISPATCH); + + iree_hal_command_buffer_release(command_buffer); +} + +TEST_F(MatMulDispatchTest, BeginEnd) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); + IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer)); + + iree_hal_command_buffer_release(command_buffer); +} + +TEST_F(MatMulDispatchTest, SubmitEmpty) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); + IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer)); + + 
IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer)); + + iree_hal_command_buffer_release(command_buffer); +} + +TEST_P(MatMulDispatchTest, DispatchMatmul) { + PrepareMatmulExecutable(); + + // Create input buffer. + constexpr iree_device_size_t WIDTH = 32; + constexpr iree_device_size_t M = WIDTH, K = WIDTH, N = WIDTH; + iree_hal_buffer_t *input_A = nullptr, *input_B = nullptr, *output_C = nullptr; + int32_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count() >> + 32; + int32_t a = generate_random_number( + iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed); + int32_t b = generate_random_number( + iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed + 1); + CreateFilledDeviceBuffer(M * K * sizeof(float), a, &input_A); + CreateFilledDeviceBuffer(K * N * sizeof(float), b, &input_B); + CreateFilledDeviceBuffer(M * N * sizeof(float), -1, &output_C); + + iree_hal_buffer_ref_t binding_refs[3]; + iree_hal_buffer_binding_table_t binding_table = + iree_hal_buffer_binding_table_empty(); + binding_refs[0] = { + /*binding=*/0, + /*buffer_slot=*/0, + /*buffer=*/input_A, + /*offset=*/0, + /*length=*/M * K * sizeof(float), + }; + binding_refs[1] = { + /*binding=*/0, + /*buffer_slot=*/0, + /*buffer=*/input_B, + /*offset=*/0, + /*length=*/K * N * sizeof(float), + }; + binding_refs[2] = { + /*binding=*/0, + /*buffer_slot=*/0, + /*buffer=*/output_C, + /*offset=*/0, + /*length=*/M * N * sizeof(float), + }; + iree_hal_buffer_ref_list_t bindings = { + /*.count=*/IREE_ARRAYSIZE(binding_refs), + /*.values=*/binding_refs, + }; + + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + binding_table.count, &command_buffer)); + IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); + + uint32_t workgroup_count[3] = {1, 1, 1}; + 
IREE_ASSERT_OK(iree_hal_command_buffer_dispatch( + command_buffer, executable_, /*entry_point=*/0, workgroup_count, + iree_const_byte_span_empty(), bindings, IREE_HAL_DISPATCH_FLAG_NONE)); + + IREE_ASSERT_OK(iree_hal_command_buffer_execution_barrier( + command_buffer, + /*source_stage_mask=*/IREE_HAL_EXECUTION_STAGE_DISPATCH | + IREE_HAL_EXECUTION_STAGE_TRANSFER | + IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE, + /*target_stage_mask=*/IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE | + IREE_HAL_EXECUTION_STAGE_DISPATCH | IREE_HAL_EXECUTION_STAGE_TRANSFER, + IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, /*memory_barrier_count=*/0, + /*memory_barriers=*/nullptr, + /*buffer_barrier_count=*/0, /*buffer_barriers=*/nullptr)); + + IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer)); + + IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer, binding_table)); + + std::vector output_values; + output_values.reserve(M * N); + IREE_ASSERT_OK(iree_hal_device_transfer_d2h( + device_, output_C, + /*source_offset=*/0, output_values.data(), M * N * sizeof(float), + IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout())); + std::vector correct_output_values; + correct_output_values.reserve(M * N); + std::fill_n(correct_output_values.data(), M * N, (float)WIDTH * (a * b)); + int n_wrong = 0; + for (int i = 0; i < M * N; ++i) { + if (output_values[i] != correct_output_values[i]) { + std::cout << "wrong @ i:" << i << ", " << output_values[i] + << " != " << correct_output_values[i] << "\n"; + n_wrong += 1; + } + } + EXPECT_EQ(n_wrong, 0); + + iree_hal_command_buffer_release(command_buffer); + iree_hal_buffer_release(output_C); + iree_hal_buffer_release(input_B); + iree_hal_buffer_release(input_A); + CleanupExecutable(); +} + +INSTANTIATE_TEST_SUITE_P(MatMulDispatchTest, MatMulDispatchTest, + ::testing::Values(RecordingType::kDirect), + GenerateTestName()); + +} // namespace iree::hal::cts From 1494af7d4f4a7e1a95bce265d1af163855a703e3 Mon Sep 17 00:00:00 2001 From: makslevental Date: Fri, 
11 Oct 2024 20:26:58 -0400 Subject: [PATCH 11/35] hack tests for xrt-lite --- .github/workflows/ci-windows.yml | 2 + build_tools/build_test_cpp.sh | 8 +- build_tools/ci/cpu_comparison/run.py | 3 +- build_tools/ci/run_matmul_test.sh | 4 +- .../iree-amd-aie/PluginRegistration.cpp | 4 +- .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 4 +- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 10 +- .../iree-amd-aie/driver/xrt-lite/allocator.h | 2 +- .../iree-amd-aie/driver/xrt-lite/device.cc | 6 +- .../driver/xrt-lite/direct_command_buffer.cc | 37 ++---- .../driver/xrt-lite/direct_command_buffer.h | 2 +- .../driver/xrt-lite/executable.cc | 20 +-- .../iree-amd-aie/driver/xrt-lite/executable.h | 2 +- .../driver/xrt-lite/nop_executable_cache.cc | 7 +- .../driver/xrt-lite/nop_executable_cache.h | 2 +- .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 2 + .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 108 --------------- .../driver/xrt-lite/shim/linux/kmq/bo.h | 25 ---- .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 2 +- .../driver/xrt-lite/shim/linux/kmq/kernel.cpp | 125 ++++++++++++++++++ .../driver/xrt-lite/shim/linux/kmq/kernel.h | 35 +++++ tests/conftest.py | 3 +- 22 files changed, 211 insertions(+), 202 deletions(-) create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 2d7a6e86e..4a2c06a2e 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -103,6 +103,8 @@ jobs: # Remove-Item -Path "$pwd\llvm-build" -Force $env:llvm_install_dir = "$pwd\llvm-install" echo $env:llvm_install_dir + .\build_tools\download_peano.ps1 + $env:peano_install_dir = "$pwd\llvm-aie" .\build_tools.\build_test_cpp.ps1 - name: Create artifacts diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index 612f5999a..e45e3cdf4 100644 --- 
a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -109,7 +109,7 @@ if [[ "$OSTYPE" == "linux"* ]]; then -DCMAKE_CXX_COMPILER="${CXX}" -DLLVM_TARGET_ARCH=X86 -DLLVM_TARGETS_TO_BUILD=X86 - -DIREE_EXTERNAL_HAL_DRIVERS="xrt;xrt-lite" + -DIREE_EXTERNAL_HAL_DRIVERS="xrt-lite" -S "$iree_dir" -B @@ -155,5 +155,7 @@ if [ -d "$llvm_install_dir" ]; then fi cp "$build_dir/tools/testing/e2e/iree-e2e-matmul-test" "$install_dir/bin" -mkdir -p "$install_dir/device_tests" -cp "$build_dir"/runtime/plugins/AMD-AIE/iree-amd-aie/driver/xrt-lite/cts/*test "$install_dir/device_tests" +if [[ "$OSTYPE" == "linux"* ]]; then + mkdir -p "$install_dir/device_tests" + cp "$build_dir"/runtime/plugins/AMD-AIE/iree-amd-aie/driver/xrt-lite/cts/*test "$install_dir/device_tests" +fi diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index fdacf7cbc..5875bce25 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -146,6 +146,7 @@ def generate_aie_vmfb( f"--iree-amd-aie-install-dir={config.iree_install_dir}", f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}", f"--iree-hal-dump-executable-files-to={config.output_dir}", + "--iree-amdaie-device-hal=xrt-lite", "--iree-scheduling-optimize-bindings=false", "--iree-hal-memoization=false", "--iree-hal-indirect-command-buffers=false", @@ -191,7 +192,7 @@ def generate_aie_output(config, aie_vmfb, input_args, function_name, name, outpu config.iree_run_exe, f"--module={aie_vmfb}", *input_args, - "--device=xrt", + "--device=xrt-lite", f"--output=@{aie_bin}", ] if function_name: diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 50649e8fd..d6170df29 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -176,7 +176,7 @@ function run_matmul_test() { local target_device="npu1_4col" - local device="xrt" + local device="xrt-lite" local peano_install_path="${PEANO}" @@ -530,7 +530,7 @@ 
run_matmul_test \ --acc_type "f32" \ --target_backend "amd-aie" \ --target_device "npu1_4col" \ - --device "xrt" \ + --device "xrt-lite" \ --peano_install_path "${PEANO}" \ --amd_aie_install_path "${IREE_INSTALL_DIR}" \ --vitis_path "${VITIS}" \ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp index 39c044d59..8a3903e81 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp @@ -46,9 +46,9 @@ struct AMDAIESession } void populateHALTargetDevices(IREE::HAL::TargetDeviceList &targets) override { - // #hal.device.target<"xrt", ... + // #hal.device.target<"xrt-lite", ... // #hal.executable.target<"amd-aie", ... - targets.add("xrt", [=]() { return AMDAIE::createTarget(options); }); + targets.add("xrt-lite", [=]() { return AMDAIE::createTarget(options); }); } void populateHALTargetBackends( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index 2c643d39b..7a03142c9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -101,7 +101,7 @@ class AIETargetDevice final : public IREE::HAL::TargetDevice { targetRegistry.getTargetBackend("amd-aie")->getDefaultExecutableTargets( context, "amd-aie", configAttr, executableTargetAttrs); - return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("xrt"), + return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("xrt-lite"), configAttr, executableTargetAttrs); } @@ -113,7 +113,7 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { public: explicit AIETargetBackend(const AMDAIEOptions &options) : options(options) {} - std::string getLegacyDefaultDeviceID() const override { return "xrt"; } + std::string 
getLegacyDefaultDeviceID() const override { return "xrt-lite"; } void getDefaultExecutableTargets( MLIRContext *context, StringRef deviceID, DictionaryAttr deviceConfigAttr, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 2211d2103..7280035d7 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -24,11 +24,11 @@ extern const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable; struct iree_hal_xrt_lite_allocator { iree_hal_resource_t resource; iree_allocator_t host_allocator; - std::shared_ptr shim_device; + shim_xdna::device* shim_device; IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) iree_hal_xrt_lite_allocator(iree_allocator_t host_allocator, - std::shared_ptr shim_device) + shim_xdna::device* shim_device) : host_allocator(host_allocator), shim_device(shim_device) { IREE_TRACE_ZONE_BEGIN(z0); iree_hal_resource_initialize(&iree_hal_xrt_lite_allocator_vtable, @@ -266,7 +266,7 @@ static iree_hal_xrt_lite_allocator* iree_hal_xrt_lite_allocator_cast( } iree_status_t iree_hal_xrt_lite_allocator_create( - iree_allocator_t host_allocator, std::shared_ptr device, + iree_allocator_t host_allocator, shim_xdna::device* device, iree_hal_allocator_t** out_allocator) { IREE_ASSERT_ARGUMENT(out_allocator); IREE_TRACE_ZONE_BEGIN(z0); @@ -296,11 +296,7 @@ static void iree_hal_xrt_lite_allocator_destroy( iree_hal_xrt_lite_allocator_cast(base_allocator); IREE_TRACE_ZONE_BEGIN(z0); - // TODO(max): shouldn't this be happening automatically via the refcounting - // (or just the dtor of device?) - allocator->shim_device.reset(); iree_hal_resource_release(&allocator->resource); - // something's not happening here? 
iree_allocator_free(allocator->host_allocator, allocator); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h index 630bcdab3..062ba6505 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h @@ -13,7 +13,7 @@ // Creates a buffer allocator used for persistent allocations. iree_status_t iree_hal_xrt_lite_allocator_create( - iree_allocator_t host_allocator, std::shared_ptr device, + iree_allocator_t host_allocator, shim_xdna::device* device, iree_hal_allocator_t** out_allocator); #endif // IREE_HAL_DRIVERS_XRT_LITE_ALLOCATOR_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 8ee3a3085..efa301dbc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -26,7 +26,7 @@ struct iree_hal_xrt_lite_device { // Block pool used for command buffers with a larger block size (as command // buffers can contain inlined data uploads). iree_arena_block_pool_t block_pool; - std::shared_ptr shim_device; + shim_xdna::device* shim_device; iree_status_t create_executable_cache( iree_string_view_t identifier, iree_loop_t loop, @@ -162,7 +162,7 @@ iree_status_t iree_hal_xrt_lite_device_create( identifier, &device->identifier, reinterpret_cast(device) + total_size - identifier.size); device->host_allocator = host_allocator; - device->shim_device = std::make_shared(); + device->shim_device = new shim_xdna::device; // TODO(null): pass device handles and pool configuration to the allocator. // Some implementations may share allocators across multiple devices created @@ -205,7 +205,7 @@ static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { // and joined first. 
iree_hal_allocator_release(device->device_allocator); - device->shim_device.reset(); + delete device->shim_device; iree_allocator_free(host_allocator, device); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index 7c856b88e..1a4123877 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -9,6 +9,7 @@ #include "iree-amd-aie/driver/xrt-lite/buffer.h" #include "iree-amd-aie/driver/xrt-lite/executable.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h" #include "iree/hal/utils/resource_set.h" // The max number of bindings per descriptor set allowed in the XRT HAL @@ -29,7 +30,7 @@ struct iree_hal_xrt_lite_direct_command_buffer { // Staging arena used for host->device transfers. iree_arena_allocator_t arena; - std::shared_ptr shim_device; + shim_xdna::device* shim_device; struct { shim_xdna::bo* bindings[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT]; @@ -52,12 +53,12 @@ iree_hal_xrt_lite_direct_command_buffer_cast( iree_hal_command_buffer_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_direct_command_buffer_vtable); - return (iree_hal_xrt_lite_direct_command_buffer*)base_value; + return reinterpret_cast(base_value); } iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( - std::shared_ptr shim_device, - iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, + shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator, + iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, @@ -106,7 +107,7 @@ static void iree_hal_xrt_lite_direct_command_buffer_destroy( 
iree_hal_xrt_lite_direct_command_buffer_cast(base_command_buffer); iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - command_buffer->shim_device.reset(); + iree_hal_resource_set_free(command_buffer->resource_set); iree_arena_deinitialize(&command_buffer->arena); iree_allocator_free(host_allocator, command_buffer); @@ -135,18 +136,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_end( return iree_ok_status(); } -static void iree_hal_xrt_lite_direct_command_buffer_begin_debug_group( - iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label, - iree_hal_label_color_t label_color, - const iree_hal_label_location_t* location) { - (void)iree_status_from_code(IREE_STATUS_UNIMPLEMENTED); -} - -static void iree_hal_xrt_lite_direct_command_buffer_end_debug_group( - iree_hal_command_buffer_t* base_command_buffer) { - (void)iree_status_from_code(IREE_STATUS_UNIMPLEMENTED); -} - static iree_status_t iree_hal_xrt_lite_direct_command_buffer_execution_barrier( iree_hal_command_buffer_t* base_command_buffer, iree_hal_execution_stage_t source_stage_mask, @@ -300,8 +289,7 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( cu_name += ":IREE"; shim_xdna::cuidx_t cu_idx = kernel_params.context->open_cu_context(cu_name); - shim_xdna::exec_buf ebuf(command_buffer->shim_device->get_pdev(), - ERT_START_CU); + shim_xdna::kernel ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU); ebuf.set_cu_idx(cu_idx); unsigned int opcode = 3; ebuf.add_arg_64(opcode); @@ -311,13 +299,9 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); ebuf.add_arg_bo(*bo); - } - - for (iree_host_size_t j = 0; j < bindings.count; ++j) { - shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( - iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); 
bo->sync(shim_xdna::direction::host2device); } + shim_xdna::hw_q* hwq = kernel_params.context->get_hw_queue(); hwq->issue_command(ebuf.get_exec_buf_bo()); hwq->wait_command(ebuf.get_exec_buf_bo(), 0); @@ -328,11 +312,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( bo->sync(shim_xdna::direction::device2host); } - for (iree_host_size_t j = 0; j < bindings.count; ++j) { - shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( - iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); - } - IREE_TRACE_ZONE_END(z0); return iree_ok_status(); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h index 91eb4aece..6aebaa624 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h @@ -18,7 +18,7 @@ extern "C" { // |out_command_buffer| must be released by the caller (see // iree_hal_command_buffer_release). 
iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( - std::shared_ptr shim_device, + shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index 2e9c46ec1..6d45655b4 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -100,7 +100,7 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( } iree_status_t iree_hal_xrt_lite_native_executable_create( - std::shared_ptr shim_device, + shim_xdna::device* shim_device, const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) { IREE_ASSERT_ARGUMENT(executable_params); @@ -176,8 +176,8 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); params->xclbinVector = xclbinVector; -// xrt::xclbin xclbin = xrt::xclbin(xclbinVector); -// params->context = shim_device->create_hw_context(xclbin); + // xrt::xclbin xclbin = xrt::xclbin(xclbinVector); + // params->context = shim_device->create_hw_context(xclbin); uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); @@ -186,13 +186,13 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( params->asm_inst = iree_amd_aie_hal_xrt_AsmInstDef_asm_inst_get(asminst_def); -// uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); -// size_t ctrl_code_size = num_instr * sizeof(uint32_t); -// params->bo_ctrl_code = -// shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); -// uint32_t* instr_buffer = -// static_cast(params->bo_ctrl_code->map()); -// memcpy(instr_buffer, asm_inst, 
ctrl_code_size); + // uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); + // size_t ctrl_code_size = num_instr * sizeof(uint32_t); + // params->bo_ctrl_code = + // shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); + // uint32_t* instr_buffer = + // static_cast(params->bo_ctrl_code->map()); + // memcpy(instr_buffer, asm_inst, ctrl_code_size); // Stash the entry point name in the string table for use when tracing. IREE_TRACE({ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index ee57055e4..a923a87d7 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -36,7 +36,7 @@ struct iree_hal_xrt_lite_kernel_params_t { // |out_executable| must be released by the caller (see // iree_hal_executable_release). iree_status_t iree_hal_xrt_lite_native_executable_create( - std::shared_ptr shim_device, + shim_xdna::device* shim_device, const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc index 8a617f977..e9a04144d 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -17,7 +17,7 @@ struct iree_hal_xrt_lite_nop_executable_cache_t { // Abstract resource used for injecting reference counting and vtable; must be // at offset 0. 
iree_hal_resource_t resource; - std::shared_ptr shim_device; + shim_xdna::device* shim_device; iree_allocator_t host_allocator; }; @@ -35,8 +35,8 @@ iree_hal_xrt_lite_nop_executable_cache_cast( } iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( - std::shared_ptr shim_device, - iree_string_view_t identifier, iree_allocator_t host_allocator, + shim_xdna::device* shim_device, iree_string_view_t identifier, + iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache) { IREE_ASSERT_ARGUMENT(out_executable_cache); *out_executable_cache = nullptr; @@ -62,7 +62,6 @@ static void iree_hal_xrt_lite_nop_executable_cache_destroy( iree_hal_xrt_lite_nop_executable_cache_cast(base_executable_cache); IREE_TRACE_ZONE_BEGIN(z0); - executable_cache->shim_device.reset(); iree_allocator_free(executable_cache->host_allocator, executable_cache); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h index 251119fdd..0322944b3 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h @@ -22,7 +22,7 @@ extern "C" { // |out_executable_cache| must be released by the caller (see // iree_hal_executable_cache_release). 
iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( - std::shared_ptr shim_device, + shim_xdna::device* shim_device, iree_string_view_t identifier, iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt index f6a25b1d4..026524a13 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -21,6 +21,8 @@ iree_cc_library( hwctx.h hwq.cpp hwq.h + kernel.cpp + kernel.h shim_debug.cpp shim_debug.h DEPS diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index 4cb322881..27723d969 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -479,112 +479,4 @@ uint32_t bo::get_arg_bo_handles(uint32_t *handles, size_t num) const { return sz; } -exec_buf::exec_buf(const pdev &p, uint32_t op) - : m_exec_buf_bo(std::make_unique(p, AMDXDNA_INVALID_CTX_HANDLE, - MAX_EXEC_BO_SIZE, - XCL_BO_FLAGS_EXECBUF)), - m_cmd_pkt(reinterpret_cast(m_exec_buf_bo->map())), - m_cmd_size(m_exec_buf_bo->size()), - m_op(op), - m_arg_cnt(0), - m_reg_idx(0) { - std::memset(m_cmd_pkt, 0, m_cmd_size); - m_cmd_pkt->state = ERT_CMD_STATE_NEW; - m_cmd_pkt->opcode = m_op; - m_cmd_pkt->type = ERT_CU; - // One word for cu mask - inc_pkt_count(sizeof(int32_t)); -} - -void exec_buf::set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx) { - ert_start_kernel_cmd *cmd_pkt = - reinterpret_cast(bo_execbuf.map()); - cmd_pkt->cu_mask = 0x1 << cu_idx.index; -} - -void exec_buf::set_cu_idx(cuidx_t cu_idx) { - m_cmd_pkt->cu_mask = 0x1 << cu_idx.index; -} - -void exec_buf::add_ctrl_bo(bo &bo_ctrl) { - ert_start_kernel_cmd *cmd_packet = - 
reinterpret_cast(m_exec_buf_bo->map()); - switch (m_op) { - case ERT_START_CU: - break; - case ERT_START_NPU: { - ert_npu_data *npu_data = get_ert_npu_data(cmd_packet); - npu_data->instruction_buffer = bo_ctrl.get_paddr(); - npu_data->instruction_buffer_size = bo_ctrl.size(); - npu_data->instruction_prop_count = 0; - inc_pkt_count(sizeof(*npu_data)); - break; - } - case ERT_START_DPU: { - ert_dpu_data *dpu_data = get_ert_dpu_data(cmd_packet); - dpu_data->instruction_buffer = bo_ctrl.get_paddr(); - dpu_data->instruction_buffer_size = bo_ctrl.size(); - dpu_data->chained = 0; - inc_pkt_count(sizeof(*dpu_data)); - break; - } - default: - throw std::runtime_error("Unknown exec buf op code: " + - std::to_string(m_op)); - } -} - -void exec_buf::add_arg_32(uint32_t val) { - inc_pkt_count(sizeof(val)); - auto args = get_ert_regmap_begin(m_cmd_pkt); - args[m_reg_idx++] = val; - m_arg_cnt++; -} - -void exec_buf::add_arg_64(uint64_t val) { - inc_pkt_count(sizeof(val)); - auto args = get_ert_regmap_begin(m_cmd_pkt); - args[m_reg_idx++] = val; - args[m_reg_idx++] = val >> 32; - m_arg_cnt++; -} - -void exec_buf::add_arg_bo(bo &bo_arg, std::string arg_name) { - // Add to argument list for driver - m_exec_buf_bo->bind_at(m_arg_cnt, bo_arg, 0, bo_arg.size()); - // Add to argument list for control code patching - if (arg_name.empty()) - m_patching_args.emplace_back(std::to_string(m_arg_cnt), bo_arg.get_paddr()); - else - m_patching_args.emplace_back(arg_name, bo_arg.get_paddr()); - // Only increase m_arg_cnt now after it's used by code above. 
- add_arg_64(bo_arg.get_paddr()); -} - -void exec_buf::dump() { - std::cout << "Dumping exec buf:"; - int *data = static_cast(m_exec_buf_bo->map()); - std::cout << std::hex; - for (int i = 0; i < m_cmd_pkt->count + 1; i++) { - if (i % 4 == 0) std::cout << "\n"; - std::cout << std::setfill('0') << std::setw(8) << data[i] << " "; - } - std::cout << std::setfill(' ') << std::setw(0) << std::dec << std::endl; - - std::cout << "Dumping patching arguement list:\n"; - for (auto &[arg_name, arg_addr] : m_patching_args) - std::cout << "{ " << arg_name << ", 0x" << std::hex << arg_addr << std::dec - << " }\n"; -} - -void exec_buf::inc_pkt_count(uint32_t n) { - m_cmd_pkt->count += n / sizeof(int32_t); - if (m_cmd_size < - sizeof(m_cmd_pkt->header) + m_cmd_pkt->count * sizeof(int32_t)) - throw std::runtime_error("Size of exec buf too small: " + - std::to_string(m_cmd_size)); -} - -bo *exec_buf::get_exec_buf_bo() { return m_exec_buf_bo.get(); } - } // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h index 16d01fe8c..24b57566f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -13,8 +13,6 @@ namespace shim_xdna { -#define MAX_EXEC_BO_SIZE 4096 - enum xclBOSyncDirection { XCL_BO_SYNC_BO_TO_DEVICE = 0, XCL_BO_SYNC_BO_FROM_DEVICE, @@ -107,29 +105,6 @@ struct bo { uint32_t get_arg_bo_handles(uint32_t *handles, size_t num) const; }; -struct exec_buf { - std::unique_ptr m_exec_buf_bo; - ert_start_kernel_cmd *m_cmd_pkt; - size_t m_cmd_size; - uint32_t m_op; - uint32_t m_arg_cnt; - uint32_t m_reg_idx; - std::vector > m_patching_args; - - exec_buf(const pdev &p, uint32_t op); - - static void set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx); - void set_cu_idx(cuidx_t cu_idx); - bo* get_exec_buf_bo(); - - void add_ctrl_bo(bo &bo_ctrl); - void add_arg_32(uint32_t val); - void add_arg_64(uint64_t 
val); - void add_arg_bo(bo &bo_arg, std::string arg_name = ""); - void dump(); - void inc_pkt_count(uint32_t n); -}; - } // namespace shim_xdna #endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index 574fc8a20..013f008dd 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -85,7 +85,7 @@ hw_ctx::~hw_ctx() { cuidx_t hw_ctx::open_cu_context(const std::string &cu_name) { for (uint32_t i = 0; i < m_cu_info.size(); i++) { auto &ci = m_cu_info[i]; - shim_debug("ci.m_name %s\n", ci.m_name.c_str()); + shim_debug("ci.m_name %s", ci.m_name.c_str()); if (ci.m_name == cu_name) return cuidx_t{.index = i}; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp new file mode 100644 index 000000000..b86da244a --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp @@ -0,0 +1,125 @@ +// +// Created by mlevental on 10/11/24. 
+// + +#include "kernel.h" + +#include +#include + +#include "amdxdna_accel.h" +#include "bo.h" +#include "device.h" + +#define MAX_EXEC_BO_SIZE 4096 + +namespace shim_xdna { +kernel::kernel(const pdev &p, uint32_t op) + : m_exec_buf_bo(std::make_unique(p, AMDXDNA_INVALID_CTX_HANDLE, + MAX_EXEC_BO_SIZE, + XCL_BO_FLAGS_EXECBUF)), + m_cmd_pkt(reinterpret_cast(m_exec_buf_bo->map())), + m_cmd_size(m_exec_buf_bo->size()), + m_op(op), + m_arg_cnt(0), + m_reg_idx(0) { + std::memset(m_cmd_pkt, 0, m_cmd_size); + m_cmd_pkt->state = ERT_CMD_STATE_NEW; + m_cmd_pkt->opcode = m_op; + m_cmd_pkt->type = ERT_CU; + // One word for cu mask + inc_pkt_count(sizeof(int32_t)); +} + +void kernel::set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx) { + ert_start_kernel_cmd *cmd_pkt = + reinterpret_cast(bo_execbuf.map()); + cmd_pkt->cu_mask = 0x1 << cu_idx.index; +} + +void kernel::set_cu_idx(cuidx_t cu_idx) { + m_cmd_pkt->cu_mask = 0x1 << cu_idx.index; +} + +void kernel::add_ctrl_bo(bo &bo_ctrl) { + ert_start_kernel_cmd *cmd_packet = + reinterpret_cast(m_exec_buf_bo->map()); + switch (m_op) { + case ERT_START_CU: + break; + case ERT_START_NPU: { + ert_npu_data *npu_data = get_ert_npu_data(cmd_packet); + npu_data->instruction_buffer = bo_ctrl.get_paddr(); + npu_data->instruction_buffer_size = bo_ctrl.size(); + npu_data->instruction_prop_count = 0; + inc_pkt_count(sizeof(*npu_data)); + break; + } + case ERT_START_DPU: { + ert_dpu_data *dpu_data = get_ert_dpu_data(cmd_packet); + dpu_data->instruction_buffer = bo_ctrl.get_paddr(); + dpu_data->instruction_buffer_size = bo_ctrl.size(); + dpu_data->chained = 0; + inc_pkt_count(sizeof(*dpu_data)); + break; + } + default: + throw std::runtime_error("Unknown exec buf op code: " + + std::to_string(m_op)); + } +} + +void kernel::add_arg_32(uint32_t val) { + inc_pkt_count(sizeof(val)); + auto args = get_ert_regmap_begin(m_cmd_pkt); + args[m_reg_idx++] = val; + m_arg_cnt++; +} + +void kernel::add_arg_64(uint64_t val) { + inc_pkt_count(sizeof(val)); + auto args = 
get_ert_regmap_begin(m_cmd_pkt); + args[m_reg_idx++] = val; + args[m_reg_idx++] = val >> 32; + m_arg_cnt++; +} + +void kernel::add_arg_bo(bo &bo_arg, const std::string &arg_name) { + // Add to argument list for driver + m_exec_buf_bo->bind_at(m_arg_cnt, bo_arg, 0, bo_arg.size()); + // Add to argument list for control code patching + if (arg_name.empty()) + m_patching_args.emplace_back(std::to_string(m_arg_cnt), bo_arg.get_paddr()); + else + m_patching_args.emplace_back(arg_name, bo_arg.get_paddr()); + // Only increase m_arg_cnt now after it's used by code above. + add_arg_64(bo_arg.get_paddr()); +} + +void kernel::dump() { + std::cout << "Dumping exec buf:"; + int *data = static_cast(m_exec_buf_bo->map()); + std::cout << std::hex; + for (int i = 0; i < m_cmd_pkt->count + 1; i++) { + if (i % 4 == 0) std::cout << "\n"; + std::cout << std::setfill('0') << std::setw(8) << data[i] << " "; + } + std::cout << std::setfill(' ') << std::setw(0) << std::dec << std::endl; + + std::cout << "Dumping patching arguement list:\n"; + for (auto &[arg_name, arg_addr] : m_patching_args) + std::cout << "{ " << arg_name << ", 0x" << std::hex << arg_addr << std::dec + << " }\n"; +} + +void kernel::inc_pkt_count(uint32_t n) const { + m_cmd_pkt->count += n / sizeof(int32_t); + if (m_cmd_size < + sizeof(m_cmd_pkt->header) + m_cmd_pkt->count * sizeof(int32_t)) + throw std::runtime_error("Size of exec buf too small: " + + std::to_string(m_cmd_size)); +} + +bo *kernel::get_exec_buf_bo() const { return m_exec_buf_bo.get(); } + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h new file mode 100644 index 000000000..2993a8465 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h @@ -0,0 +1,35 @@ +// +// Created by mlevental on 10/11/24. 
+// + +#ifndef KERNEL_H +#define KERNEL_H + +#include "bo.h" + +namespace shim_xdna { +struct kernel { + std::unique_ptr m_exec_buf_bo; + ert_start_kernel_cmd *m_cmd_pkt; + size_t m_cmd_size; + uint32_t m_op; + uint32_t m_arg_cnt; + uint32_t m_reg_idx; + std::vector > m_patching_args; + + kernel(const pdev &p, uint32_t op); + + static void set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx); + void set_cu_idx(cuidx_t cu_idx); + bo *get_exec_buf_bo() const; + + void add_ctrl_bo(bo &bo_ctrl); + void add_arg_32(uint32_t val); + void add_arg_64(uint64_t val); + void add_arg_bo(bo &bo_arg, const std::string &arg_name = ""); + void dump(); + void inc_pkt_count(uint32_t n) const; +}; +} // namespace shim_xdna + +#endif // KERNEL_H diff --git a/tests/conftest.py b/tests/conftest.py index 3bc6d4daa..3b1518c21 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -80,6 +80,7 @@ def iree_session(request, pytestconfig, global_cl_args) -> Session: f"--iree-amd-aie-install-dir={pytestconfig.option.iree_install_dir}", f"--iree-amd-aie-enable-chess={use_chess}", f"--iree-amdaie-enable-packet-flow={enable_packet_flow}", + "--iree-amdaie-device-hal=xrt-lite", ] if pytestconfig.option.vitis_dir: flags += [f"--iree-amd-aie-vitis-install-dir={pytestconfig.option.vitis_dir}"] @@ -106,7 +107,7 @@ def session_module(iree_session, tmp_path) -> ir.Module: @pytest.fixture(scope="session") -def device(device="xrt") -> ir.Module: +def device(device="xrt-lite") -> ir.Module: yield get_driver(device).create_default_device() From f34f5b2945f93d69b9925ab3a3621b28ef1fab86 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sat, 12 Oct 2024 04:15:02 -0400 Subject: [PATCH 12/35] remove xclbin (and XRT) dep --- .github/workflows/ci-linux.yml | 41 +-- build_tools/ci/run_matmul_test.sh | 18 +- .../AMD-AIE/iree-amd-aie/CMakeLists.txt | 1 + .../iree-amd-aie/PluginRegistration.cpp | 11 +- .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 300 ++++++++++++------ .../AMD-AIE/iree-amd-aie/Target/AIETarget.h | 10 + 
.../iree-amd-aie/Target/CMakeLists.txt | 1 + .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp | 129 +++++--- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.h | 5 +- iree_compiler_plugin.cmake | 2 +- iree_runtime_plugin.cmake | 5 +- .../driver/xrt-lite/CMakeLists.txt | 2 +- .../driver/xrt-lite/cts/CMakeLists.txt | 50 +-- .../xrt-lite/cts/executable_cache_test.cc | 2 +- .../xrt-lite/cts/executable_cache_test.mlir | 2 +- .../xrt-lite/cts/matmul_dispatch_test.cc | 2 +- .../iree-amd-aie/driver/xrt-lite/device.cc | 2 +- .../driver/xrt-lite/direct_command_buffer.cc | 10 +- .../driver/xrt-lite/executable.cc | 105 +++--- .../iree-amd-aie/driver/xrt-lite/executable.h | 4 +- .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 2 - .../driver/xrt-lite/shim/linux/kmq/device.cpp | 11 +- .../driver/xrt-lite/shim/linux/kmq/device.h | 7 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 145 +++------ .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 15 +- .../xrt-lite/shim/linux/kmq/shim_debug.cpp | 1 + 26 files changed, 456 insertions(+), 427 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 7f1cd1f56..44a3eb03b 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -51,7 +51,7 @@ jobs: git reset --hard FETCH_HEAD git -c submodule."third_party/torch-mlir".update=none \ -c submodule."third_party/stablehlo".update=none \ - -c submodule."src/runtime_src/core/common/aiebu".update=none \ + -c submodule."third_party/XRT".update=none \ submodule update --init --recursive --depth 1 --single-branch -j 10 - name: Install deps @@ -77,6 +77,11 @@ jobs: key: ${{ env.CACHE_KEY }} restore-keys: linux-build-test-cpp- + - name: Peano dep + run: | + bash build_tools/download_peano.sh + echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV + - name: Build packages run: | export cache_dir="${{ env.CACHE_DIR }}" @@ -158,56 +163,29 @@ jobs: - name : E2E comparison of AIE to llvm-cpu run: | source .venv/bin/activate - source /opt/xilinx/xrt/setup.sh 
python build_tools/ci/cpu_comparison/run.py \ test_aie_vs_cpu \ $PWD/iree-install \ $PWD/llvm-aie \ - --xrt-dir /opt/xilinx/xrt \ --vitis-dir /opt/Xilinx/Vitis/2024.2 \ --reset-npu-between-runs -v - name: E2E correctness matmul test run: | - # Without this additional line an error like - # - # [XRT] ERROR: Failed to allocate host memory buffer (mmap(len=10616832, prot=3, flags=8193, offset=4294967296) - # failed (err=11): Resource temporarily unavailable), make sure host bank is enabled (see xbutil configure --host-mem) - # iree-amd-aie/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc:179: RESOURCE_EXHAUSTED; could not allocate - # memory for buffer; while invoking C++ function matmul_test.generate_random_matrix; while calling import; - # - # might be observed when too much memory is allocated. This - # error was seen when running a bf16->f32 matmul with m=n=k=2304. - # - # This line was suggested at https://github.com/Xilinx/mlir-air/issues/566 - # - # Note that this is only half of the fix. It is also necessary that - # the machine that CI is running on has permission to run this line. - # - # This permission can be adding by adding the line - # ``` - # %github ALL=(ALL) NOPASSWD: /usr/bin/prlimit * - # ``` - # - # to the file /etc/sudoers.d/github, which can be done by running - # ``` - # sudo visudo -f /etc/sudoers.d/github - # ``` - # on the github CI machine. 
+ # https://stackoverflow.com/a/17567422 + # shim_xdna::bo::map_drm_bo does an mmap with MAP_LOCKED + # which can fail if limit is too low sudo prlimit -lunlimited --pid $$ source .venv/bin/activate - source /opt/xilinx/xrt/setup.sh bash build_tools/ci/run_matmul_test.sh \ test_matmuls \ iree-install \ $PWD/llvm-aie \ - /opt/xilinx/xrt \ /opt/Xilinx/Vitis/2024.2 - name: Python tests run: | source .venv/bin/activate - source /opt/xilinx/xrt/setup.sh pytest -v tests \ --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ @@ -215,7 +193,6 @@ jobs: - name: XRT-LITE tests run: | - source /opt/xilinx/xrt/setup.sh DEVICE_TEST_DIR="$PWD/iree-install/device_tests" for t in $(ls $DEVICE_TEST_DIR); do $DEVICE_TEST_DIR/$t diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index d6170df29..5ced68435 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -102,21 +102,11 @@ if [ ! -d "${PEANO}" ]; then exit 1 fi -# Parameter 4) +# Parameter 4) if [ -z "${4-}" ]; then - XRT_DIR=/opt/xilinx/xrt -else - XRT_DIR=`realpath "$4"` -fi -if [ -d "$XRT_DIR" ]; then - source $XRT_DIR/setup.sh -fi - -# Parameter 5) -if [ -z "${5-}" ]; then VITIS=/opt/Xilinx/Vitis/2024.2 else - VITIS=`realpath "$5"` + VITIS=`realpath "$4"` fi THIS_DIR="$(cd $(dirname $0) && pwd)" @@ -139,9 +129,6 @@ fi GITHUB_ACTIONS="${GITHUB_ACTIONS:-false}" -# Circumvent xclbin security (no longer needed as of April 2024 XDNA driver) -export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 - cd ${OUTPUT_DIR} export MATMUL_TESTS_RUN=0 @@ -405,6 +392,7 @@ function run_matmul_test() { --iree-amd-aie-enable-chess=${use_chess} \ --iree-amdaie-enable-packet-flow=${enable_packet_flow} \ --iree-hal-dump-executable-files-to=$PWD \ + --iree-amdaie-device-hal=xrt-lite \ --iree-hal-memoization=false \ --iree-hal-indirect-command-buffers=false \ --mlir-elide-resource-strings-if-larger=10 \ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt index b6077f9c5..bd5865430 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt @@ -26,6 +26,7 @@ iree_cc_library( iree::base::core_headers iree::base::internal::flatcc::building iree-amd-aie::schemas::xrt_executable_def_c_fbs + iree-amd-aie::schemas::pdi_executable_def_c_fbs PUBLIC ) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp index 8a3903e81..9fe88cd64 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp @@ -46,9 +46,16 @@ struct AMDAIESession } void populateHALTargetDevices(IREE::HAL::TargetDeviceList &targets) override { + // #hal.device.target<"xrt", ... + targets.add("xrt", [=] { + options.deviceHal = AMDAIE::AMDAIEOptions::DeviceHAL::XRT; + return AMDAIE::createTarget(options); + }); // #hal.device.target<"xrt-lite", ... - // #hal.executable.target<"amd-aie", ... 
- targets.add("xrt-lite", [=]() { return AMDAIE::createTarget(options); }); + targets.add("xrt-lite", [=] { + options.deviceHal = AMDAIE::AMDAIEOptions::DeviceHAL::XRT_LITE; + return AMDAIE::createTarget(options); + }); } void populateHALTargetBackends( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index 7a03142c9..acb0de8f5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -7,6 +7,7 @@ #include "iree-amd-aie/Target/AIETarget.h" #include +#include #include "XCLBinGen.h" #include "aie/AIEDialect.h" @@ -18,6 +19,8 @@ #include "air/Dialect/AIRRt/AIRRtDialect.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/schemas/pdi_executable_def_builder.h" +#include "iree-amd-aie/schemas/xrt_executable_def_builder.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" @@ -43,7 +46,6 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Target/LLVMIR/Dialect/All.h" -#include "runtime/plugins/AMD-AIE/iree-amd-aie/schemas/xrt_executable_def_builder.h" #define DEBUG_TYPE "aie-target" @@ -56,18 +58,20 @@ static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp, moduleOp.walk([&](xilinx::AIE::DeviceOp d) { ++nDeviceOpsVisited; // This attribute should've been set in the dma-to-npu pass. 
- auto maybeName = d->getAttrOfType("runtime_sequence_name"); + StringAttr maybeName = + d->getAttrOfType("runtime_sequence_name"); if (!maybeName) return WalkResult::advance(); - auto name = maybeName.getValue(); + StringRef name = maybeName.getValue(); if (name != targetName) return WalkResult::advance(); deviceOp = d; return WalkResult::interrupt(); }); - if (!deviceOp) + if (!deviceOp) { moduleOp.emitError() << "visited " << nDeviceOpsVisited << " aie.device ops, and failed to find one with name " << targetName; + } return deviceOp; } @@ -84,7 +88,7 @@ static void sanitizeForBootgen(std::string &symbol) { class AIETargetDevice final : public IREE::HAL::TargetDevice { public: - AIETargetDevice(const AMDAIEOptions &options) : options(options) {} + AIETargetDevice(AMDAIEOptions options) : options(std::move(options)) {} IREE::HAL::DeviceTargetAttr getDefaultDeviceTarget( MLIRContext *context, @@ -101,8 +105,17 @@ class AIETargetDevice final : public IREE::HAL::TargetDevice { targetRegistry.getTargetBackend("amd-aie")->getDefaultExecutableTargets( context, "amd-aie", configAttr, executableTargetAttrs); - return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("xrt-lite"), - configAttr, executableTargetAttrs); + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + return IREE::HAL::DeviceTargetAttr::get( + context, b.getStringAttr("xrt"), configAttr, executableTargetAttrs); + case AMDAIEOptions::DeviceHAL::XRT_LITE: + return IREE::HAL::DeviceTargetAttr::get( + context, b.getStringAttr("xrt-lite"), configAttr, + executableTargetAttrs); + default: + llvm_unreachable("unsupported device HAL\n"); + } } private: @@ -111,9 +124,19 @@ class AIETargetDevice final : public IREE::HAL::TargetDevice { class AIETargetBackend final : public IREE::HAL::TargetBackend { public: - explicit AIETargetBackend(const AMDAIEOptions &options) : options(options) {} - - std::string getLegacyDefaultDeviceID() const override { return "xrt-lite"; } + explicit 
AIETargetBackend(AMDAIEOptions options) + : options(std::move(options)) {} + + std::string getLegacyDefaultDeviceID() const override { + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + return "xrt"; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + return "xrt-lite"; + default:; + llvm::report_fatal_error("unsupported default device\n"); + }; + } void getDefaultExecutableTargets( MLIRContext *context, StringRef deviceID, DictionaryAttr deviceConfigAttr, @@ -139,9 +162,19 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { addConfig("ukernels", StringAttr::get(context, options.enableAMDAIEUkernels)); auto configAttr = b.getDictionaryAttr(configItems); - return IREE::HAL::ExecutableTargetAttr::get( - context, b.getStringAttr("amd-aie"), - b.getStringAttr("amdaie-xclbin-fb"), configAttr); + + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + return IREE::HAL::ExecutableTargetAttr::get( + context, b.getStringAttr("amd-aie"), + b.getStringAttr("amdaie-xclbin-fb"), configAttr); + case AMDAIEOptions::DeviceHAL::XRT_LITE: + return IREE::HAL::ExecutableTargetAttr::get( + context, b.getStringAttr("amd-aie"), + b.getStringAttr("amdaie-pdi-fb"), configAttr); + default:; + llvm::report_fatal_error("unsupported default HAL\n"); + }; } void getDependentDialects(DialectRegistry ®istry) const override { @@ -191,48 +224,92 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { AMDAIEOptions options; }; +void serializeXCLBinToFb(FlatbufferBuilder &builder, + flatbuffers_string_vec_ref_t entryPointsRef, + SmallVector &asmInstrIndices, + SmallVector &xclbinIndices, + SmallVector xclbinRefs, + SmallVector asmInstrRefs) { + iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef); + flatbuffers_int32_vec_ref_t asmInstrIndicesRef = + builder.createInt32Vec(asmInstrIndices); + iree_amd_aie_hal_xrt_ExecutableDef_asm_instr_indices_add(builder, + asmInstrIndicesRef); + flatbuffers_int32_vec_ref_t 
xclbinIndicesRef = + builder.createInt32Vec(xclbinIndices); + iree_amd_aie_hal_xrt_ExecutableDef_xclbin_indices_add(builder, + xclbinIndicesRef); + flatbuffers_vec_ref_t xclbinsRef = + builder.createOffsetVecDestructive(xclbinRefs); + iree_amd_aie_hal_xrt_ExecutableDef_xclbins_add(builder, xclbinsRef); + flatbuffers_vec_ref_t asmInstrsRef = + builder.createOffsetVecDestructive(asmInstrRefs); + iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_add(builder, asmInstrsRef); + iree_amd_aie_hal_xrt_ExecutableDef_end_as_root(builder); +} + +void serializePDIToFb(FlatbufferBuilder &builder, + flatbuffers_string_vec_ref_t entryPointsRef, + SmallVector &asmInstrIndices, + SmallVector &pdiIndices, + SmallVector pdiRefs, + SmallVector asmInstrRefs) { + iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_add(builder, + entryPointsRef); + flatbuffers_int32_vec_ref_t asmInstrIndicesRef = + builder.createInt32Vec(asmInstrIndices); + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instr_indices_add( + builder, asmInstrIndicesRef); + flatbuffers_int32_vec_ref_t pdiIndicesRef = + builder.createInt32Vec(pdiIndices); + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdi_indices_add(builder, + pdiIndicesRef); + flatbuffers_vec_ref_t pdisRef = builder.createOffsetVecDestructive(pdiRefs); + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_add(builder, pdisRef); + flatbuffers_vec_ref_t asmInstrsRef = + builder.createOffsetVecDestructive(asmInstrRefs); + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_add(builder, asmInstrsRef); + iree_amd_aie_hal_xrt_lite_ExecutableDef_end_as_root(builder); +} + LogicalResult AIETargetBackend::serializeExecutable( const SerializationOptions &serOptions, IREE::HAL::ExecutableVariantOp variantOp, OpBuilder &executableBuilder) { ModuleOp moduleOp = variantOp.getInnerModule(); - auto basename = + std::string basename = llvm::join_items("_", serOptions.dumpBaseName, variantOp.getName()); sanitizeForBootgen(basename); - auto maybeWorkDir = [&]() -> FailureOr> { - // If a path 
for intermediates has been specified, assume it is common for - // all executables compiling in parallel, and so create an - // executable-specific subdir to keep this executable's intermediates - // separate. - if (!serOptions.dumpIntermediatesPath.empty()) { - SmallString<128> workDir{serOptions.dumpIntermediatesPath}; - llvm::sys::path::append(workDir, basename); - auto ecode = llvm::sys::fs::create_directories(workDir); - if (ecode) { - return moduleOp.emitError() - << "failed to create working directory " << workDir - << ". Error message : " << ecode.message(); - } - return workDir; - } + FailureOr> maybeWorkDir; + // If a path for intermediates has been specified, assume it is common for + // all executables compiling in parallel, and so create an + // executable-specific subdir to keep this executable's intermediates + // separate. + if (!serOptions.dumpIntermediatesPath.empty()) { + SmallString<128> workDir{serOptions.dumpIntermediatesPath}; + llvm::sys::path::append(workDir, basename); + if (auto ecode = llvm::sys::fs::create_directories(workDir)) { + return moduleOp.emitError() + << "failed to create working directory " << workDir + << ". Error message : " << ecode.message(); + } + maybeWorkDir = workDir; + } else { // No path for intermediates: make a temporary directory for this // executable that is certain to be distinct from the dir of any other // executable. 
SmallString<128> workDirFromScratch; - auto err = llvm::sys::fs::createUniqueDirectory( - /* prefix = */ variantOp.getName(), workDirFromScratch); - - if (err) + if (auto err = llvm::sys::fs::createUniqueDirectory( + /*prefix=*/variantOp.getName(), workDirFromScratch)) { return moduleOp.emitOpError() - << "failed to create working directory for xclbin generation: " + << "failed to create working directory for artifact generation: " << err.message(); + } + maybeWorkDir = workDirFromScratch; + } - return workDirFromScratch; - }(); - - if (failed(maybeWorkDir)) return failure(); - auto workDir = maybeWorkDir.value(); + SmallString<128> workDir = maybeWorkDir.value(); // collect names of kernels as they need to be in kernels.json // generated by `aie2xclbin` SmallVector entryPointNames; @@ -241,7 +318,7 @@ LogicalResult AIETargetBackend::serializeExecutable( // Map to keep track of which ordinal number belongs to which entry point, // typically the order is sequential but that is not gauranteed std::map entryPointOrdinals; - for (auto exportOp : variantOp.getExportOps()) { + for (IREE::HAL::ExecutableExportOp exportOp : variantOp.getExportOps()) { uint64_t ordinal = 0; if (std::optional optionalOrdinal = exportOp.getOrdinal()) { ordinal = optionalOrdinal->getZExtValue(); @@ -269,56 +346,69 @@ LogicalResult AIETargetBackend::serializeExecutable( // error out if we think the name will most likely be too long // for the artifact generation to succeed. We set this cut-off at 50 // characters. 
- if (entryPointName.size() > 50) + if (entryPointName.size() > 50) { return exportOp.emitError() << "entry point name: " << entryPointName << "is too long!"; + } } + uint64_t ordinalCount = entryPointOrdinals.size(); if (entryPointNames.empty()) { return moduleOp.emitOpError("should contain some entry points"); } - std::unique_ptr xclbinIn; - + std::unique_ptr artifactInput; FlatbufferBuilder builder; - iree_amd_aie_hal_xrt_ExecutableDef_start_as_root(builder); - SmallVector xclbinRefs; - SmallVector asmInstrRefs; + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + iree_amd_aie_hal_xrt_ExecutableDef_start_as_root(builder); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + iree_amd_aie_hal_xrt_lite_ExecutableDef_start_as_root(builder); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } + + SmallVector refs; + SmallVector asmInstrRefs; // Per entry-point data. // Note that the following vectors should all be of the same size and // element at index #i is for entry point with ordinal #i! SmallVector entryPointNamesFb(ordinalCount); - SmallVector xclbinIndices(ordinalCount); + SmallVector indices(ordinalCount); SmallVector asmInstrIndices(ordinalCount); for (size_t i = 0; i < entryPointNames.size(); i++) { uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]); - entryPointNamesFb[ordinal] = entryPointNames[i]; std::string errorMessage; - - // we add the entry point to the working directory for xclbin artifacts if - // there are multiple entry points so that we dont overwrite the xclbinutil + // we add the entry point to the working directory for artifacts if + // there are multiple entry points so that we don't overwrite the // generated artifacts e.g kernels.json, for different entry points which // will have the same exact names. 
SmallString<128> entryPointWorkDir(workDir); - if (ordinalCount > 1) + if (ordinalCount > 1) { llvm::sys::path::append(entryPointWorkDir, entryPointNamesFb[ordinal]); - auto err = llvm::sys::fs::create_directories(entryPointWorkDir); - if (err) + } + + if (auto err = llvm::sys::fs::create_directories(entryPointWorkDir)) { return moduleOp.emitOpError() - << "failed to create working directory for xclbin generation: " + << "failed to create working directory for pdi generation: " << err.message(); + } llvm::outs().flush(); - SmallString<128> xclbinPath(entryPointWorkDir); - llvm::sys::path::append(xclbinPath, entryPointNamesFb[ordinal] + ".xclbin"); + + SmallString<128> artifactPath(entryPointWorkDir); + llvm::sys::path::append(artifactPath, entryPointNamesFb[ordinal] + ".pdi"); SmallString<128> npuInstPath(entryPointWorkDir); llvm::sys::path::append(npuInstPath, entryPointNamesFb[ordinal] + ".npu.txt"); - // Convert ordinal to hexadecimal string for xclbin kernel id. + // Convert ordinal to hexadecimal string for pdi kernel id. 
std::stringstream ordinalHex; ordinalHex << "0x" << std::hex << ordinal; @@ -342,6 +432,7 @@ LogicalResult AIETargetBackend::serializeExecutable( // TODO(max): this should be an enum // TODO(max): this needs to be pulled from PCIE std::string npuVersion; + std::string targetArch; switch (options.AMDAIETargetDevice) { case AMDAIEDevice::npu1: case AMDAIEDevice::npu1_1col: @@ -349,18 +440,21 @@ LogicalResult AIETargetBackend::serializeExecutable( case AMDAIEDevice::npu1_3col: case AMDAIEDevice::npu1_4col: npuVersion = "npu1"; + targetArch = "AIE2"; break; case AMDAIEDevice::npu4: npuVersion = "npu4"; + targetArch = "AIE2P"; break; default: - llvm::report_fatal_error("unhandled NPU partitioning.\n"); + llvm::errs() << "unhandled NPU partitioning.\n"; + return failure(); } if (failed(aie2xclbin( /*ctx=*/variantOp->getContext(), deviceOps[i], /*outputNPU=*/npuInstPath.str().str(), - /*outputXCLBin=*/xclbinPath.str().str(), + /*artifactPath=*/artifactPath.str().str(), /*printIRBeforeAll=*/options.aie2xclbinPrintIrBeforeAll, /*printIRAfterAll=*/options.aie2xclbinPrintIrAfterAll, /*printIRModuleScope=*/options.aie2xclbinPrintIrModuleScope, @@ -371,17 +465,18 @@ LogicalResult AIETargetBackend::serializeExecutable( /*vitisDir=*/options.vitisInstallDir.empty() ? 
std::nullopt : std::optional{options.vitisInstallDir}, - // TODO(max): not right for strix - /*targetArch=*/"AIE2", + /*targetArch=*/targetArch, /*npuVersion=*/npuVersion, /*peanoDir=*/options.peanoInstallDir, + /*deviceHal=*/options.deviceHal, /*xclBinKernelID=*/ordinalHex.str(), /*xclBinKernelName=*/entryPointNamesFb[ordinal], /*xclBinInstanceName=*/"IREE", /*amdAIEInstallDir=*/options.amdAieInstallDir, /*InputXCLBin=*/std::nullopt, - /*ukernel=*/options.enableAMDAIEUkernels))) + /*ukernel=*/options.enableAMDAIEUkernels))) { return failure(); + } std::ifstream instrFile(static_cast(npuInstPath)); std::string line; @@ -395,40 +490,61 @@ LogicalResult AIETargetBackend::serializeExecutable( } npuInstrs.push_back(a); } - auto npuInstrsVec = builder.createInt32Vec(npuInstrs); + flatbuffers_int32_vec_ref_t npuInstrsVec = + builder.createInt32Vec(npuInstrs); asmInstrIndices[ordinal] = asmInstrRefs.size(); - asmInstrRefs.push_back( - iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec)); - xclbinIn = openInputFile(xclbinPath, &errorMessage); - if (!xclbinIn) { - moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; + if (options.deviceHal == AMDAIEOptions::DeviceHAL::XRT_LITE) { + asmInstrRefs.push_back( + iree_amd_aie_hal_xrt_lite_AsmInstDef_create(builder, npuInstrsVec)); + } else if (options.deviceHal == AMDAIEOptions::DeviceHAL::XRT) { + asmInstrRefs.push_back( + iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec)); + } else { + llvm::report_fatal_error("unsupported backend"); } - auto xclbinStringRef = builder.createString(xclbinIn->getBuffer()); - xclbinIndices[ordinal] = xclbinRefs.size(); - xclbinRefs.push_back( - iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); - } - // Serialize the executable to flatbuffer format - auto entryPointsRef = builder.createStringVec(entryPointNamesFb); - iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef); - - flatbuffers_int32_vec_ref_t 
asmInstrIndicesRef = - builder.createInt32Vec(asmInstrIndices); - iree_amd_aie_hal_xrt_ExecutableDef_asm_instr_indices_add(builder, - asmInstrIndicesRef); - flatbuffers_int32_vec_ref_t xclbinIndicesRef = - builder.createInt32Vec(xclbinIndices); - iree_amd_aie_hal_xrt_ExecutableDef_xclbin_indices_add(builder, - xclbinIndicesRef); - auto xclbinsRef = builder.createOffsetVecDestructive(xclbinRefs); - iree_amd_aie_hal_xrt_ExecutableDef_xclbins_add(builder, xclbinsRef); + artifactInput = openInputFile(artifactPath, &errorMessage); + if (!artifactInput) { + moduleOp.emitOpError() + << "Failed to open artifact file: " << errorMessage; + } + flatbuffers_string_ref_t artifactStringRef = + builder.createString(artifactInput->getBuffer()); + indices[ordinal] = refs.size(); + + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + refs.push_back( + iree_amd_aie_hal_xrt_XclbinDef_create(builder, artifactStringRef)); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + refs.push_back(iree_amd_aie_hal_xrt_lite_PdiDef_create( + builder, artifactStringRef)); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } + } - auto asmInstrsRef = builder.createOffsetVecDestructive(asmInstrRefs); - iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_add(builder, asmInstrsRef); + // Serialize the executable to flatbuffer format + flatbuffers_string_vec_ref_t entryPointsRef = + builder.createStringVec(entryPointNamesFb); + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + serializeXCLBinToFb(builder, entryPointsRef, asmInstrIndices, indices, + refs, asmInstrRefs); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + serializePDIToFb(builder, entryPointsRef, asmInstrIndices, indices, refs, + asmInstrRefs); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } - iree_amd_aie_hal_xrt_ExecutableDef_end_as_root(builder); auto binaryOp = executableBuilder.create( variantOp.getLoc(), 
variantOp.getSymName(), variantOp.getTarget().getFormat(), diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h index 1f6518909..91007c988 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h @@ -54,6 +54,9 @@ struct AMDAIEOptions { std::string enableAMDAIEUkernels{"none"}; bool enablePacketFlow{false}; + enum class DeviceHAL { XRT, XRT_LITE }; + DeviceHAL deviceHal{DeviceHAL::XRT}; + void bindOptions(OptionsBinder &binder) { static llvm::cl::OptionCategory category("AMD AIE Options"); binder.opt( @@ -187,6 +190,13 @@ struct AMDAIEOptions { binder.opt("iree-amdaie-enable-packet-flow", enablePacketFlow, llvm::cl::cat(category), llvm::cl::desc("Enable packet routing data movement.")); + + binder.opt( + "iree-amdaie-device-hal", deviceHal, llvm::cl::cat(category), + llvm::cl::desc("Sets the target device HAL."), + llvm::cl::values(clEnumValN(DeviceHAL::XRT, "xrt", "xrt device HAL"), + clEnumValN(DeviceHAL::XRT_LITE, "xrt-lite", + "xrt-lite device HAL"))); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt index 3c7cd4d64..63a38950a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt @@ -34,6 +34,7 @@ iree_cc_library( DEPS ::AIETargets iree-amd-aie::schemas::xrt_executable_def_c_fbs + iree-amd-aie::schemas::pdi_executable_def_c_fbs iree::base::internal::flatcc::building iree::base::internal::flatcc::parsing iree::compiler::Dialect::HAL::Target diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index a1d155269..7e4aded8a 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -728,6 +728,60 @@ static json::Object makeKernelJSON(const std::string &name, {"instances", json::Array{json::Object{{"name", instance}}}}}; } +static LogicalResult generatePDI(const std::string &Output, + const Path &tempDir) { + std::string errorMessage; + // Create design.bif. + Path designBifFile = tempDir / "design.bif"; + { + auto designBifOut = openOutputFile(designBifFile.string(), &errorMessage); + if (!designBifOut) { + llvm::errs() << "failed to open design.bif because: " << errorMessage; + return failure(); + } + + designBifOut->os() << "all:\n" + << "{\n" + << " id_code = 0x14ca8093\n" + << " extended_id_code = 0x01\n" + << " image\n" + << " {\n" + << " name=aie_image, id=0x1c000000\n" + << " { type=cdo\n" + << " file=" << tempDir.string() + << "/aie_cdo_elfs.bin\n" + << " file=" << tempDir.string() + << "/aie_cdo_init.bin\n" + << " file=" << tempDir.string() + << "/aie_cdo_enable.bin\n" + << " }\n" + << " }\n" + << "}"; + designBifOut->keep(); + } + + // Execute the bootgen command. 
+ { + // first element is empty string because iree_aie_bootgen_main + // is the main of bootgen.exe (and argv[0] is typically the name of the exe) + std::vector flags = { + "", "-arch", "versal", "-image", designBifFile.string(), + "-o", Output, "-w"}; + std::vector cstrings; + cstrings.reserve(flags.size()); + for (const auto &inputFlag : flags) { + cstrings.push_back(const_cast(inputFlag.c_str())); + } + if (iree_aie_bootgen_main(cstrings.size(), + const_cast(&cstrings[0]))) { + llvm::errs() << "failed to execute bootgen"; + return failure(); + } + } + + return success(); +} + static LogicalResult generateXCLBin( const std::string &Output, const Path &tempDir, const std::string &xclBinKernelID, const std::string &xclBinKernelName, @@ -832,58 +886,11 @@ static LogicalResult generateXCLBin( return failure(); } } - // Create design.bif. - Path designBifFile = tempDir / "design.bif"; - { - auto designBifOut = openOutputFile(designBifFile.string(), &errorMessage); - if (!designBifOut) { - llvm::errs() << "failed to open design.bif because: " << errorMessage; - return failure(); - } - designBifOut->os() << "all:\n" - << "{\n" - << " id_code = 0x14ca8093\n" - << " extended_id_code = 0x01\n" - << " image\n" - << " {\n" - << " name=aie_image, id=0x1c000000\n" - << " { type=cdo\n" - << " file=" << tempDir.string() - << "/aie_cdo_elfs.bin\n" - << " file=" << tempDir.string() - << "/aie_cdo_init.bin\n" - << " file=" << tempDir.string() - << "/aie_cdo_enable.bin\n" - << " }\n" - << " }\n" - << "}"; - designBifOut->keep(); + if (failed(generatePDI((tempDir / "design.pdi").string(), tempDir))) { + return failure(); } - // Execute the bootgen command. 
- { - // first element is empty string because iree_aie_bootgen_main - // is the main of bootgen.exe (and argv[0] is typically the name of the exe) - std::vector flags = {"", - "-arch", - "versal", - "-image", - designBifFile.string(), - "-o", - (tempDir / "design.pdi").string(), - "-w"}; - std::vector cstrings; - cstrings.reserve(flags.size()); - for (const auto &inputFlag : flags) { - cstrings.push_back(const_cast(inputFlag.c_str())); - } - if (iree_aie_bootgen_main(cstrings.size(), - const_cast(&cstrings[0]))) { - llvm::errs() << "failed to execute bootgen"; - return failure(); - } - } std::vector flags; // Execute the xclbinutil command. std::string memArg = "MEM_TOPOLOGY:JSON:" + memTopologyJsonFile.string(); @@ -1111,11 +1118,12 @@ LogicalResult emitNpuInstructions(AIE::DeviceOp deviceOp, LogicalResult aie2xclbin( MLIRContext *ctx, AIE::DeviceOp deviceOp, const std::string &outputNPU, - const std::string &outputXCLBin, bool printIRBeforeAll, + const std::string &artifactPath, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const std::string &tempDir, bool useChess, bool verbose, const std::optional &vitisDir, const std::string &targetArch, const std::string &npuVersion, const std::string &peanoDir, + const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal, const std::string &xclBinKernelID, const std::string &xclBinKernelName, const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir, const std::optional &InputXCLBin, @@ -1150,7 +1158,26 @@ LogicalResult aie2xclbin( return failure(); } - if (failed(generateXCLBin(outputXCLBin, tempDirPath, xclBinKernelID, + Path pdiPath = tempDirPath / "design.pdi"; + if (failed(generatePDI(pdiPath, tempDirPath))) { + llvm::errs() << "Failed to generate PDI\n"; + return failure(); + } + + if (deviceHal == AMDAIEOptions::DeviceHAL::XRT_LITE) { + std::error_code ec; + if (!std::filesystem::copy_file( + pdiPath, artifactPath, + 
std::filesystem::copy_options::overwrite_existing, ec)) { + llvm::errs() << "Failed to copy file because: " << ec.message() << "\n"; + return failure(); + } + return success(); + } + + assert(deviceHal == AMDAIEOptions::DeviceHAL::XRT && + "generating XCLBin for non-XRT HAL"); + if (failed(generateXCLBin(artifactPath, tempDirPath, xclBinKernelID, xclBinKernelName, xclBinInstanceName, amdAIEInstallDir, verbose, InputXCLBin))) { llvm::errs() << "Failed to generate XCLBin\n"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h index cd7bd2f2a..6083e7293 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h @@ -7,18 +7,19 @@ #include +#include "AIETarget.h" #include "aie/AIEDialect.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/Support/LogicalResult.h" namespace mlir::iree_compiler::AMDAIE { mlir::LogicalResult aie2xclbin( mlir::MLIRContext *ctx, xilinx::AIE::DeviceOp, const std::string &outputNPU, - const std::string &outputXCLBin, bool printIRBeforeAll, + const std::string &artifactPath, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const std::string &tempDir, bool useChess, bool verbose, const std::optional &vitisDir, const std::string &targetArch, const std::string &npuVersion, const std::string &peanoDir, + const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal, const std::string &xclBinKernelID, const std::string &xclBinKernelName, const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir, const std::optional &InputXCLBin, diff --git a/iree_compiler_plugin.cmake b/iree_compiler_plugin.cmake index 3b50361c8..a707091ca 100644 --- a/iree_compiler_plugin.cmake +++ b/iree_compiler_plugin.cmake @@ -23,7 +23,7 @@ if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER ON) endif() 
-if(IREE_AMD_AIE_ENABLE_XRT_DRIVER OR IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) +if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) include(iree_aie_xrt) endif() include(iree_aie_bootgen) diff --git a/iree_runtime_plugin.cmake b/iree_runtime_plugin.cmake index 594ca4ca0..0bc5637b5 100644 --- a/iree_runtime_plugin.cmake +++ b/iree_runtime_plugin.cmake @@ -27,8 +27,11 @@ if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER ON) endif() -if(IREE_AMD_AIE_ENABLE_XRT_DRIVER OR IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) +if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) include(iree_aie_xrt) +endif() + +if(IREE_AMD_AIE_ENABLE_XRT_DRIVER OR IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) include(iree_aie_bootgen) endif() diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt index d5e5ecea1..0863ea8c3 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -42,7 +42,7 @@ iree_cc_library( iree::hal::utils::deferred_command_buffer iree::hal::utils::semaphore_base iree::base::internal::flatcc::parsing - iree-amd-aie::schemas::xrt_executable_def_c_fbs + iree-amd-aie::schemas::pdi_executable_def_c_fbs iree-amd-aie::driver::xrt-lite::shim::linux::kmq::shim-xdna PUBLIC ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt index a8125ec00..03642c0c2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -16,7 +16,7 @@ iree_hal_cts_test_suite( COMPILER_TARGET_BACKEND "amd-aie" EXECUTABLE_FORMAT - "\"amdaie-xclbin-fb\"" + "\"amdaie-pdi-fb\"" DEPS iree-amd-aie::driver::xrt-lite::registration INCLUDED_TESTS @@ -49,11 +49,10 @@ iree_bytecode_module( --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} --iree-amd-aie-vitis-install-dir=${VITIS_DIR} --iree-amd-aie-enable-chess=$ + 
--iree-amdaie-device-hal=xrt-lite --iree-amd-aie-show-invoked-commands --iree-hal-memoization=false --iree-hal-indirect-command-buffers=false - DEPS - iree-aie-xclbinutil PUBLIC TESTONLY ) @@ -78,51 +77,6 @@ iree_c_embed_data( TESTONLY ) -#iree_bytecode_module( -# NAME -# xrt_lite_command_buffer_dispatch_test_module -# MODULE_FILE_NAME -# xrt_lite_command_buffer_dispatch_test.bin -# SRC -# "${CMAKE_CURRENT_LIST_DIR}/command_buffer_dispatch_test.mlir" -# FLAGS -# --compile-mode=hal-executable -# --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} -# --iree-hal-target-backends=amd-aie -# --iree-amdaie-lower-to-aie-pipeline=air -# --iree-amdaie-target-device=${TARGET_DEVICE} -# --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} -# --iree-amd-aie-vitis-install-dir=${VITIS_DIR} -# --iree-amd-aie-enable-chess=$ -# --iree-amd-aie-show-invoked-commands -# --iree-hal-memoization=false -# --iree-hal-indirect-command-buffers=false -# DEPS -# iree-aie-xclbinutil -# PUBLIC -# TESTONLY -#) -# -#iree_c_embed_data( -# NAME -# xrt_lite_command_buffer_dispatch_c -# SRCS -# xrt_lite_command_buffer_dispatch_test.bin -# C_FILE_OUTPUT -# xrt_lite_command_buffer_dispatch_c.c -# H_FILE_OUTPUT -# xrt_lite_command_buffer_dispatch_c.h -# IDENTIFIER -# iree_cts_testdata_command_buffer_dispatch_aie_xrt_lite -# STRIP_PREFIX -# xrt_lite_ -# DEPENDS -# ::xrt_lite_command_buffer_dispatch_test_module -# FLATTEN -# PUBLIC -# TESTONLY -#) - iree_cc_test( NAME xrt_lite_executable_cache_test diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc index 0904d33a6..9aa19a89c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc @@ -21,7 +21,7 @@ iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { return iree_hal_xrt_lite_driver_module_register(registry); } -const 
char* get_test_executable_format() { return "amdaie-xclbin-fb"; } +const char* get_test_executable_format() { return "amdaie-pdi-fb"; } iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { const struct iree_file_toc_t* toc = diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir index dedbcab6b..ca306e1e5 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir @@ -30,4 +30,4 @@ hal.executable.source public @amdaie_fb { return } } -} +} \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc index f00bfbddc..ce7d4ca83 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc @@ -23,7 +23,7 @@ iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { return iree_hal_xrt_lite_driver_module_register(registry); } -const char* get_test_executable_format() { return "amdaie-xclbin-fb"; } +const char* get_test_executable_format() { return "amdaie-pdi-fb"; } iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { const struct iree_file_toc_t* toc = diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index efa301dbc..e2c5ac2bc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -106,7 +106,7 @@ struct iree_hal_xrt_lite_device { if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) { *out_value = - iree_string_view_equal(key, IREE_SV("amdaie-xclbin-fb")) ? 1 : 0; + iree_string_view_equal(key, IREE_SV("amdaie-pdi-fb")) ? 
1 : 0; return iree_ok_status(); } return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported query"); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index 1a4123877..a15f11334 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -274,9 +274,8 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &executable)); - xrt::xclbin xclbin = xrt::xclbin(kernel_params.xclbinVector); - kernel_params.context = - command_buffer->shim_device->create_hw_context(xclbin); + kernel_params.context = command_buffer->shim_device->create_hw_context( + kernel_params.pdiVector, kernel_params.kernel_name); uint32_t num_instr = flatbuffers_uint32_vec_len(kernel_params.asm_inst); size_t ctrl_code_size = num_instr * sizeof(uint32_t); auto bo_ctrl_code = command_buffer->shim_device->alloc_bo( @@ -285,9 +284,8 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( memcpy(instr_buffer, kernel_params.asm_inst, ctrl_code_size); bo_ctrl_code->sync(shim_xdna::direction::host2device); - std::string cu_name = kernel_params.kernel_name; - cu_name += ":IREE"; - shim_xdna::cuidx_t cu_idx = kernel_params.context->open_cu_context(cu_name); + shim_xdna::cuidx_t cu_idx = + kernel_params.context->open_cu_context(kernel_params.kernel_name); shim_xdna::kernel ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU); ebuf.set_cu_idx(cu_idx); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index 6d45655b4..7108ae2e7 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -10,8 +10,8 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" 
#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h" -#include "iree-amd-aie/schemas/xrt_executable_def_reader.h" -#include "iree-amd-aie/schemas/xrt_executable_def_verifier.h" +#include "iree-amd-aie/schemas/pdi_executable_def_reader.h" +#include "iree-amd-aie/schemas/pdi_executable_def_verifier.h" #include "iree/base/api.h" struct iree_hal_xrt_lite_native_executable_t { @@ -53,7 +53,7 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( // Run flatcc generated verification. This ensures all pointers are in-bounds // and that we can safely walk the file, but not that the actual contents of // the flatbuffer meet our expectations. - int verify_ret = iree_amd_aie_hal_xrt_ExecutableDef_verify_as_root( + int verify_ret = iree_amd_aie_hal_xrt_lite_ExecutableDef_verify_as_root( flatbuffer_data.data, flatbuffer_data.data_length); if (verify_ret != flatcc_verify_ok) { return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, @@ -61,11 +61,11 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( flatcc_verify_error_string(verify_ret)); } - iree_amd_aie_hal_xrt_ExecutableDef_table_t executable_def = - iree_amd_aie_hal_xrt_ExecutableDef_as_root(flatbuffer_data.data); + iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = + iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root(flatbuffer_data.data); flatbuffers_string_vec_t entry_points_vec = - iree_amd_aie_hal_xrt_ExecutableDef_entry_points_get(executable_def); + iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); if (entry_point_count == 0) { return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, @@ -79,16 +79,17 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( } } - iree_amd_aie_hal_xrt_XclbinDef_vec_t xclbins = - iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def); - size_t number_xclbin = iree_amd_aie_hal_xrt_XclbinDef_vec_len(xclbins); - if (number_xclbin == 0) { 
- return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no xclbin present"); + iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis = + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); + size_t number_pdi = iree_amd_aie_hal_xrt_lite_PdiDef_vec_len(pdis); + if (number_pdi == 0) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no pdi present"); } - iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instr = - iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def); - size_t number_asm_instr = iree_amd_aie_hal_xrt_AsmInstDef_vec_len(asm_instr); + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instr = + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); + size_t number_asm_instr = + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_len(asm_instr); if (number_asm_instr != entry_point_count) { return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "number of entry points (%zu) and number of asm " @@ -114,19 +115,20 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( z0, iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( executable_params->executable_data)); - iree_amd_aie_hal_xrt_ExecutableDef_table_t executable_def = - iree_amd_aie_hal_xrt_ExecutableDef_as_root( + iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = + iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root( executable_params->executable_data.data); - flatbuffers_uint32_vec_t xclbin_indices_vec = - iree_amd_aie_hal_xrt_ExecutableDef_xclbin_indices_get(executable_def); + flatbuffers_uint32_vec_t pdi_indices_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdi_indices_get(executable_def); flatbuffers_uint32_vec_t asm_instr_indices_vec = - iree_amd_aie_hal_xrt_ExecutableDef_asm_instr_indices_get(executable_def); + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instr_indices_get( + executable_def); flatbuffers_string_vec_t entry_points_vec = - iree_amd_aie_hal_xrt_ExecutableDef_entry_points_get(executable_def); - iree_amd_aie_hal_xrt_XclbinDef_vec_t 
xclbins_vec = - iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def); - iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instrs_vec = - iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def); + iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); + iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instrs_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); iree_host_size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); @@ -165,34 +167,23 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( &executable->entry_points[entry_ordinal]; params->kernel_name = flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); - uint32_t xclbin_index = - flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal); - iree_amd_aie_hal_xrt_XclbinDef_table_t xclbin_def = - iree_amd_aie_hal_xrt_XclbinDef_vec_at(xclbins_vec, xclbin_index); - flatbuffers_string_t xclbin_fb = - iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def); - - // XRT API needs this vector and cant actually read a void*. 
- std::vector xclbinVector( - xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); - params->xclbinVector = xclbinVector; - // xrt::xclbin xclbin = xrt::xclbin(xclbinVector); - // params->context = shim_device->create_hw_context(xclbin); - + uint32_t pdi_index = + flatbuffers_uint32_vec_at(pdi_indices_vec, entry_ordinal); + iree_amd_aie_hal_xrt_lite_PdiDef_table_t pdi_def = + iree_amd_aie_hal_xrt_lite_PdiDef_vec_at(pdis_vec, pdi_index); + flatbuffers_string_t pdi_fb = + iree_amd_aie_hal_xrt_lite_PdiDef_pdi_get(pdi_def); + + std::vector pdiVector(pdi_fb, + pdi_fb + flatbuffers_string_len(pdi_fb)); + params->pdiVector = pdiVector; uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); - iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def = - iree_amd_aie_hal_xrt_AsmInstDef_vec_at(asm_instrs_vec, asm_instr_index); + iree_amd_aie_hal_xrt_lite_AsmInstDef_table_t asminst_def = + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_at(asm_instrs_vec, + asm_instr_index); params->asm_inst = - iree_amd_aie_hal_xrt_AsmInstDef_asm_inst_get(asminst_def); - - // uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); - // size_t ctrl_code_size = num_instr * sizeof(uint32_t); - // params->bo_ctrl_code = - // shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); - // uint32_t* instr_buffer = - // static_cast(params->bo_ctrl_code->map()); - // memcpy(instr_buffer, asm_inst, ctrl_code_size); + iree_amd_aie_hal_xrt_lite_AsmInstDef_asm_inst_get(asminst_def); // Stash the entry point name in the string table for use when tracing. 
IREE_TRACE({ @@ -202,18 +193,18 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( }); IREE_TRACE({ - if (iree_amd_aie_hal_xrt_ExecutableDef_source_locations_is_present( + if (iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_is_present( executable_def)) { - iree_amd_aie_hal_xrt_FileLineLocDef_vec_t source_locs_vec = - iree_amd_aie_hal_xrt_ExecutableDef_source_locations_get( + iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_t source_locs_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_get( executable_def); - iree_amd_aie_hal_xrt_FileLineLocDef_table_t source_loc = - iree_amd_aie_hal_xrt_FileLineLocDef_vec_at(source_locs_vec, - entry_ordinal); + iree_amd_aie_hal_xrt_lite_FileLineLocDef_table_t source_loc = + iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_at(source_locs_vec, + entry_ordinal); flatbuffers_string_t filename = - iree_amd_aie_hal_xrt_FileLineLocDef_filename_get(source_loc); + iree_amd_aie_hal_xrt_lite_FileLineLocDef_filename_get(source_loc); uint32_t line = - iree_amd_aie_hal_xrt_FileLineLocDef_line_get(source_loc); + iree_amd_aie_hal_xrt_lite_FileLineLocDef_line_get(source_loc); params->source_filename = iree_make_string_view(filename, flatbuffers_string_len(filename)); params->source_line = line; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index a923a87d7..57b477b4c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -25,7 +25,7 @@ extern "C" { struct iree_hal_xrt_lite_kernel_params_t { std::unique_ptr context; std::unique_ptr bo_ctrl_code; - std::vector xclbinVector; + std::vector pdiVector; flatbuffers_uint32_vec_t asm_inst; // Number of assembly instructions argument to the kernel std::string kernel_name; @@ -46,7 +46,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( iree_hal_xrt_lite_kernel_params_t* out_params); #ifdef __cplusplus -} 
// extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt index 026524a13..9d9c40fd1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -27,9 +27,7 @@ iree_cc_library( shim_debug.h DEPS uuid - $ COPTS $<$:-fexceptions -frtti> - $<$:/EHsc /GR> PUBLIC ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 3b1ddb73a..458f59305 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -150,13 +150,14 @@ device::~device() { shim_debug("Destroying KMQ device"); } const pdev &device::get_pdev() const { return m_pdev; } std::unique_ptr device::create_hw_context( - const xrt::xclbin &xclbin, const std::map &qos) { - return std::make_unique(*this, xclbin, qos); + const std::vector &pdi, const std::string &cu_name, + const std::map &qos) { + return std::make_unique(*this, pdi, cu_name, qos); } -std::unique_ptr device::create_hw_context(const xrt::xclbin &xclbin) { - const std::map qos{}; - return std::make_unique(*this, xclbin, qos); +std::unique_ptr device::create_hw_context( + const std::vector &pdi, const std::string &cu_name) { + return std::make_unique(*this, pdi, cu_name); } std::unique_ptr device::alloc_bo(uint32_t ctx_id, size_t size, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h index 23ffd3f27..1076f72f1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ 
-7,7 +7,6 @@ #include #include -#include "experimental/xrt_xclbin.h" #include "fence.h" #include "xrt_mem.h" @@ -46,8 +45,10 @@ struct device { std::unique_ptr import_bo(pid_t, int); std::unique_ptr create_hw_context( - const xrt::xclbin &xclbin, const std::map &qos); - std::unique_ptr create_hw_context(const xrt::xclbin &xclbin); + const std::vector &pdi, const std::string &cu_name, + const std::map &qos); + std::unique_ptr create_hw_context(const std::vector &pdi, + const std::string &cu_name); std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, uint32_t size); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index 013f008dd..f362a2f80 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -7,30 +7,14 @@ #include #include "bo.h" -#include "core/common/api/xclbin_int.h" #include "hwq.h" #include "shim_debug.h" -namespace { - -std::vector get_pdi(const xrt_core::xclbin::aie_partition_obj &aie, - uint16_t kernel_id) { - for (auto &pdi : aie.pdis) { - for (auto &cdo : pdi.cdo_groups) { - for (auto kid : cdo.kernel_ids) { - if (kid == kernel_id) return pdi.pdi; - } - } - } - shim_xdna::shim_err(ENOENT, "PDI for kernel ID 0x%x not found", kernel_id); -} - -} // namespace - namespace shim_xdna { hw_ctx::hw_ctx(device &dev, const std::map &qos, - std::unique_ptr q, const xrt::xclbin &xclbin) + std::unique_ptr q, const std::vector &pdi, + const std::string &cu_name, size_t functional) : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) { shim_debug("Creating HW context..."); @@ -49,27 +33,52 @@ hw_ctx::hw_ctx(device &dev, const std::map &qos, m_qos.priority = value; } - auto axlf = xclbin.get_axlf(); - auto aie_partition = xrt_core::xclbin::get_aie_partition(axlf); - - for (const auto &k : xclbin.get_kernels()) { - auto &props = 
xrt_core::xclbin_int::get_properties(k); - try { - for (const auto &cu : k.get_cus()) { - m_cu_info.push_back({.m_name = cu.get_name(), - .m_func = props.functional, - .m_pdi = get_pdi(aie_partition, props.kernel_id)}); - } - } catch (std::system_error &ex) { - if (ex.code().value() != ENOENT) throw; - shim_debug("%s", ex.what()); - } - } + m_cu_info.push_back({.m_name = cu_name, .m_func = functional, .m_pdi = pdi}); if (m_cu_info.empty()) shim_err(EINVAL, "No valid DPU kernel found in xclbin"); - m_ops_per_cycle = aie_partition.ops_per_cycle; - m_num_cols = aie_partition.ncol; + m_ops_per_cycle = 2048 /*aie_partition.ops_per_cycle*/; + m_num_cols = 4 /*aie_partition.ncol*/; +} + +hw_ctx::hw_ctx(device &device, const std::vector &pdi, + const std::string &cu_name, + const std::map &qos) + : hw_ctx(device, qos, std::make_unique(device), pdi, cu_name) { + create_ctx_on_device(); + std::vector cu_conf_param_buf(sizeof(amdxdna_hwctx_param_config_cu) + + m_cu_info.size() * + sizeof(amdxdna_cu_config)); + auto cu_conf_param = reinterpret_cast( + cu_conf_param_buf.data()); + + cu_conf_param->num_cus = m_cu_info.size(); + shim_xcl_bo_flags f = {}; + f.flags = XRT_BO_FLAGS_CACHEABLE; + for (int i = 0; i < m_cu_info.size(); i++) { + cu_info &ci = m_cu_info[i]; + + m_pdi_bos.push_back(alloc_bo(ci.m_pdi.size(), f)); + std::unique_ptr &pdi_bo = m_pdi_bos[i]; + char *pdi_vaddr = reinterpret_cast(pdi_bo->map()); + + // see cu_configs[1] in amdxdna_hwctx_param_config_cu + assert(i < 1 && "only 1 CU supported"); + amdxdna_cu_config &cf = cu_conf_param->cu_configs[i]; + std::memcpy(pdi_vaddr, ci.m_pdi.data(), ci.m_pdi.size()); + pdi_bo->sync(direction::host2device, pdi_bo->get_properties().size, 0); + cf.cu_bo = pdi_bo->get_drm_bo_handle(); + cf.cu_func = ci.m_func; + } + + amdxdna_drm_config_hwctx arg = {}; + arg.handle = m_handle; + arg.param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU; + arg.param_val = reinterpret_cast(cu_conf_param); + arg.param_val_size = cu_conf_param_buf.size(); + 
m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &arg); + + shim_debug("Created KMQ HW context (%d)", m_handle); } hw_ctx::~hw_ctx() { @@ -93,7 +102,12 @@ cuidx_t hw_ctx::open_cu_context(const std::string &cu_name) { } std::unique_ptr hw_ctx::alloc_bo(size_t size, shim_xcl_bo_flags flags) { - return alloc_bo(nullptr, size, flags); + // const_cast: alloc_bo() is not const yet in device class + // Debug buffer is specific to one context. + if (flags.use == XRT_BO_USE_DEBUG) + return m_device.alloc_bo(m_handle, size, flags); + // Other BOs are shared across all contexts. + return m_device.alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags); } std::unique_ptr hw_ctx::import_bo(pid_t pid, int ehdl) { @@ -138,7 +152,7 @@ void hw_ctx::init_log_buf() { auto log_buf_size = m_num_cols * 1024; shim_xcl_bo_flags f; f.flags = XCL_BO_FLAGS_EXECBUF; - m_log_bo = alloc_bo(nullptr, log_buf_size, f); + m_log_bo = alloc_bo(log_buf_size, f); m_log_buf = m_log_bo->map(); std::memset(m_log_buf, 0, log_buf_size); } @@ -147,59 +161,4 @@ void hw_ctx::fini_log_buf() const { if (m_log_bo) m_log_bo->unmap(m_log_buf); } -hw_ctx::hw_ctx(device &device, const xrt::xclbin &xclbin, - const std::map &qos) - : hw_ctx(device, qos, std::make_unique(device), xclbin) { - create_ctx_on_device(); - std::vector cu_conf_param_buf(sizeof(amdxdna_hwctx_param_config_cu) + - m_cu_info.size() * - sizeof(amdxdna_cu_config)); - auto cu_conf_param = reinterpret_cast( - cu_conf_param_buf.data()); - - cu_conf_param->num_cus = m_cu_info.size(); - shim_xcl_bo_flags f = {}; - f.flags = XRT_BO_FLAGS_CACHEABLE; - for (int i = 0; i < m_cu_info.size(); i++) { - cu_info &ci = m_cu_info[i]; - - m_pdi_bos.push_back(alloc_bo(ci.m_pdi.size(), f)); - std::unique_ptr &pdi_bo = m_pdi_bos[i]; - char *pdi_vaddr = reinterpret_cast(pdi_bo->map()); - - // see cu_configs[1] in amdxdna_hwctx_param_config_cu - assert(i < 1 && "only 1 CU supported"); - amdxdna_cu_config &cf = cu_conf_param->cu_configs[i]; - std::memcpy(pdi_vaddr, 
ci.m_pdi.data(), ci.m_pdi.size()); - pdi_bo->sync(direction::host2device, pdi_bo->get_properties().size, 0); - cf.cu_bo = pdi_bo->get_drm_bo_handle(); - cf.cu_func = ci.m_func; - } - - amdxdna_drm_config_hwctx arg = {}; - arg.handle = m_handle; - arg.param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU; - arg.param_val = reinterpret_cast(cu_conf_param); - arg.param_val_size = cu_conf_param_buf.size(); - m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &arg); - - shim_debug("Created KMQ HW context (%d)", m_handle); -} - -std::unique_ptr hw_ctx::alloc_bo(void *userptr, size_t size, - shim_xcl_bo_flags flags) { - // const_cast: alloc_bo() is not const yet in device class - // Debug buffer is specific to one context. - if (flags.use == XRT_BO_USE_DEBUG) - return m_device.alloc_bo(m_handle, size, flags); - // Other BOs are shared across all contexts. - return m_device.alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags); -} - -std::unique_ptr create_hw_context( - device &dev, const xrt::xclbin &xclbin, - const std::map &qos) { - return std::make_unique(dev, xclbin, qos); -} - } // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h index a145beda5..15fc2b481 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -50,14 +50,13 @@ struct hw_ctx { std::vector> m_pdi_bos; hw_ctx(device &dev, const std::map &qos, - std::unique_ptr q, const xrt::xclbin &xclbin); - hw_ctx(device &dev, const xrt::xclbin &xclbin, - const std::map &qos); + std::unique_ptr q, const std::vector &pdi, + const std::string &cu_name, size_t functional = 0); + hw_ctx(device &dev, const std::vector &pdi, + const std::string &cu_name, + const std::map &qos = {}); ~hw_ctx(); - // TODO - std::unique_ptr alloc_bo(void *userptr, size_t size, - shim_xcl_bo_flags flags); std::unique_ptr alloc_bo(size_t size, 
shim_xcl_bo_flags flags); std::unique_ptr import_bo(pid_t, int); @@ -70,10 +69,6 @@ struct hw_ctx { hw_q *get_hw_queue() const; }; -std::unique_ptr create_hw_context( - device &dev, const xrt::xclbin &xclbin, - const std::map &qos); - } // namespace shim_xdna #endif // _HWCTX_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp index 698b1a59b..a41e16193 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp @@ -31,4 +31,5 @@ void debugf(const char *format, ...) { va_start(args, format); vprintf(format, args); va_end(args); + fflush(stdout); } From d9c83cca95435b571001209928c716dbb7772806 Mon Sep 17 00:00:00 2001 From: makslevental Date: Sat, 12 Oct 2024 18:22:37 -0400 Subject: [PATCH 13/35] removed non-load-bearing vtable functions --- .github/workflows/ci-windows.yml | 3 - .../iree-amd-aie/driver/xrt-lite/allocator.cc | 108 +----------------- .../driver/xrt-lite/direct_command_buffer.cc | 93 +-------------- .../iree-amd-aie/driver/xrt-lite/driver.cc | 12 -- .../driver/xrt-lite/executable.cc | 2 +- .../driver/xrt-lite/nop_executable_cache.cc | 12 +- .../driver/xrt-lite/nop_semaphore.cc | 55 +-------- .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 2 + .../xrt-lite/shim/linux/kmq/shim_debug.h | 2 +- .../src/iree-amd-aie/driver/xrt-lite/util.h | 8 ++ 10 files changed, 30 insertions(+), 267 deletions(-) diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 4a2c06a2e..d45f22396 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -102,9 +102,6 @@ jobs: .\build_tools\build_llvm.ps1 # Remove-Item -Path "$pwd\llvm-build" -Force $env:llvm_install_dir = "$pwd\llvm-install" - echo $env:llvm_install_dir - .\build_tools\download_peano.ps1 - $env:peano_install_dir = "$pwd\llvm-aie" 
.\build_tools.\build_test_cpp.ps1 - name: Create artifacts diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 7280035d7..cde48e95b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -43,37 +43,6 @@ struct iree_hal_xrt_lite_allocator { ~iree_hal_xrt_lite_allocator() = default; - iree_status_t trim() { - // TODO(null): if the allocator is retaining any unused resources they - // should be dropped here. If the underlying implementation has pools or - // caches it should be notified that a trim is requested. This is called in - // low-memory situations or when IREE is not going to be used for awhile - // (low power modes or suspension). - (void)this; - - return iree_ok_status(); - } - - void query_statistics(iree_hal_allocator_statistics_t* out_statistics) { - IREE_STATISTICS({ - memcpy(out_statistics, &this->statistics, sizeof(*out_statistics)); - // TODO(null): update statistics (merge). - }); - } - - iree_status_t query_memory_heaps(iree_host_size_t capacity, - iree_hal_allocator_memory_heap_t* heaps, - iree_host_size_t* out_count) { - // TODO(null): return heap information. This is called at least once with a - // capacity that may be 0 (indicating a query for the total count) and the - // heaps should only be populated if capacity is sufficient to store all of - // them. - (void)this; - iree_status_t status = iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "heap query not implemented"); - return status; - } - iree_hal_buffer_compatibility_t query_buffer_compatibility( iree_hal_buffer_params_t* params, iree_device_size_t* allocation_size) { // TODO(null): set compatibility rules based on the implementation. 
@@ -193,70 +162,6 @@ struct iree_hal_xrt_lite_allocator { iree_hal_buffer_destroy(base_buffer); } - - iree_status_t import_buffer( - const iree_hal_buffer_params_t* params, - iree_hal_external_buffer_t* external_buffer, - iree_hal_buffer_release_callback_t release_callback, - iree_hal_buffer_t** out_buffer) { - // Coerce options into those required by the current device. - iree_hal_buffer_params_t compat_params = *params; - iree_device_size_t allocation_size = external_buffer->size; - iree_hal_buffer_compatibility_t compatibility = - this->query_buffer_compatibility(&compat_params, &allocation_size); - if (!iree_all_bits_set(compatibility, - IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { - // TODO(benvanik): make a helper for this. -#if IREE_STATUS_MODE - iree_bitfield_string_temp_t temp0, temp1, temp2; - iree_string_view_t memory_type_str = - iree_hal_memory_type_format(params->type, &temp0); - iree_string_view_t usage_str = - iree_hal_buffer_usage_format(params->usage, &temp1); - iree_string_view_t compatibility_str = - iree_hal_buffer_compatibility_format(compatibility, &temp2); - return iree_make_status( - IREE_STATUS_INVALID_ARGUMENT, - "allocator cannot import a buffer with the given parameters; " - "memory_type=%.*s, usage=%.*s, compatibility=%.*s", - (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, - usage_str.data, (int)compatibility_str.size, compatibility_str.data); -#else - return iree_make_status( - IREE_STATUS_INVALID_ARGUMENT, - "allocator cannot import a buffer with the given parameters"); -#endif // IREE_STATUS_MODE - } - - // TODO(null): switch on external_buffer->type and import the buffer. See - // the headers for more information on semantics. Most implementations can - // service IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION by just wrapping - // the underlying device pointer. 
Those that can service - // IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION may be able to avoid a lot - // of additional copies when moving data around between host and device or - // across devices from different drivers. - (void)this; - iree_status_t status = iree_make_status( - IREE_STATUS_UNIMPLEMENTED, "external buffer type not supported"); - - return status; - } - - iree_status_t export_buffer(iree_hal_buffer_t* buffer, - iree_hal_external_buffer_type_t requested_type, - iree_hal_external_buffer_flags_t requested_flags, - iree_hal_external_buffer_t* out_external_buffer) { - // TODO(null): switch on requested_type and export as appropriate. Most - // implementations can service - // IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION by just exposing the - // underlying device pointer. Those that can service - // IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION may be able to avoid a lot - // of additional copies when moving data around between host and device or - // across devices from different drivers. 
- (void)this; - return iree_make_status(IREE_STATUS_UNAVAILABLE, - "external buffer type not supported"); - } }; static iree_hal_xrt_lite_allocator* iree_hal_xrt_lite_allocator_cast( @@ -318,28 +223,19 @@ static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator( #define ALLOCATOR_MEMBER_VOID(member) \ MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, member) -ALLOCATOR_MEMBER_STATUS(trim); -ALLOCATOR_MEMBER_VOID(query_statistics); -ALLOCATOR_MEMBER_STATUS(query_memory_heaps); ALLOCATOR_MEMBER(query_buffer_compatibility, iree_hal_buffer_compatibility_t); ALLOCATOR_MEMBER_STATUS(allocate_buffer); ALLOCATOR_MEMBER_VOID(deallocate_buffer); -ALLOCATOR_MEMBER_STATUS(import_buffer); -ALLOCATOR_MEMBER_STATUS(export_buffer); namespace { const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable = { .destroy = iree_hal_xrt_lite_allocator_destroy, .host_allocator = iree_hal_xrt_lite_allocator_host_allocator, - .trim = iree_hal_xrt_lite_allocator_trim, - .query_statistics = iree_hal_xrt_lite_allocator_query_statistics, - .query_memory_heaps = iree_hal_xrt_lite_allocator_query_memory_heaps, + .trim = unimplemented_ok_status, + .query_statistics = unimplemented_ok_void, .query_buffer_compatibility = iree_hal_xrt_lite_allocator_query_buffer_compatibility, .allocate_buffer = iree_hal_xrt_lite_allocator_allocate_buffer, .deallocate_buffer = iree_hal_xrt_lite_allocator_deallocate_buffer, - .import_buffer = iree_hal_xrt_lite_allocator_import_buffer, - .export_buffer = iree_hal_xrt_lite_allocator_export_buffer, }; - } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index a15f11334..bfcf3c0d2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -11,6 +11,7 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h" #include 
"iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h" #include "iree/hal/utils/resource_set.h" +#include "util.h" // The max number of bindings per descriptor set allowed in the XRT HAL // implementation. @@ -115,12 +116,6 @@ static void iree_hal_xrt_lite_direct_command_buffer_destroy( IREE_TRACE_ZONE_END(z0); } -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_begin( - iree_hal_command_buffer_t* base_command_buffer) { - // Nothing to do. - return iree_ok_status(); -} - static iree_status_t iree_hal_xrt_lite_direct_command_buffer_end( iree_hal_command_buffer_t* base_command_buffer) { iree_hal_xrt_lite_direct_command_buffer* command_buffer = @@ -136,70 +131,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_end( return iree_ok_status(); } -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_execution_barrier( - iree_hal_command_buffer_t* base_command_buffer, - iree_hal_execution_stage_t source_stage_mask, - iree_hal_execution_stage_t target_stage_mask, - iree_hal_execution_barrier_flags_t flags, - iree_host_size_t memory_barrier_count, - const iree_hal_memory_barrier_t* memory_barriers, - iree_host_size_t buffer_barrier_count, - const iree_hal_buffer_barrier_t* buffer_barriers) { - if (iree_any_bit_set(source_stage_mask, IREE_HAL_EXECUTION_STAGE_HOST) || - iree_any_bit_set(target_stage_mask, IREE_HAL_EXECUTION_STAGE_HOST)) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "barrier involving host not yet supported"); - } - - if (flags != IREE_HAL_EXECUTION_BARRIER_FLAG_NONE) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "non-zero barrier flag not yet supported"); - } - - // Nothing to do in current synchronous mode. 
- - return iree_ok_status(); -} - -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_signal_event( - iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event, - iree_hal_execution_stage_t source_stage_mask) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported"); -} - -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_reset_event( - iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event, - iree_hal_execution_stage_t source_stage_mask) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported"); -} - -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_wait_events( - iree_hal_command_buffer_t* base_command_buffer, - iree_host_size_t event_count, const iree_hal_event_t** events, - iree_hal_execution_stage_t source_stage_mask, - iree_hal_execution_stage_t target_stage_mask, - iree_host_size_t memory_barrier_count, - const iree_hal_memory_barrier_t* memory_barriers, - iree_host_size_t buffer_barrier_count, - const iree_hal_buffer_barrier_t* buffer_barriers) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "event not yet supported"); -} - -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_discard_buffer( - iree_hal_command_buffer_t* base_command_buffer, - iree_hal_buffer_ref_t buffer) { - // It is okay to do nothing here. 
- return iree_ok_status(); -} - -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_fill_buffer( - iree_hal_command_buffer_t* base_command_buffer, - iree_hal_buffer_ref_t target_ref, const void* pattern, - iree_host_size_t pattern_length) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "fill buffer not yet supported"); -} - static iree_status_t iree_hal_xrt_lite_direct_command_buffer_update_buffer( iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer, iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) { @@ -245,14 +176,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_copy_buffer( return iree_ok_status(); } -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_collective( - iree_hal_command_buffer_t* base_command_buffer, iree_hal_channel_t* channel, - iree_hal_collective_op_t op, uint32_t param, iree_hal_buffer_ref_t send_ref, - iree_hal_buffer_ref_t recv_ref, iree_device_size_t element_count) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "collectives not yet supported"); -} - static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( iree_hal_command_buffer_t* base_command_buffer, iree_hal_executable_t* executable, int32_t entry_point, @@ -328,19 +251,11 @@ namespace { const iree_hal_command_buffer_vtable_t iree_hal_xrt_lite_direct_command_buffer_vtable = { .destroy = iree_hal_xrt_lite_direct_command_buffer_destroy, - .begin = iree_hal_xrt_lite_direct_command_buffer_begin, - .end = iree_hal_xrt_lite_direct_command_buffer_end, - .execution_barrier = - iree_hal_xrt_lite_direct_command_buffer_execution_barrier, - .signal_event = iree_hal_xrt_lite_direct_command_buffer_signal_event, - .reset_event = iree_hal_xrt_lite_direct_command_buffer_reset_event, - .wait_events = iree_hal_xrt_lite_direct_command_buffer_wait_events, - .discard_buffer = - iree_hal_xrt_lite_direct_command_buffer_discard_buffer, - .fill_buffer = iree_hal_xrt_lite_direct_command_buffer_fill_buffer, + 
.begin = unimplemented_ok_status, + .end = unimplemented_ok_status, + .execution_barrier = unimplemented_ok_status, .update_buffer = iree_hal_xrt_lite_direct_command_buffer_update_buffer, .copy_buffer = iree_hal_xrt_lite_direct_command_buffer_copy_buffer, - .collective = iree_hal_xrt_lite_direct_command_buffer_collective, .dispatch = iree_hal_xrt_lite_direct_command_buffer_dispatch, .dispatch_indirect = iree_hal_xrt_lite_direct_command_buffer_dispatch_indirect, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 6dc67be78..e6a7b5061 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -37,13 +37,6 @@ void iree_hal_xrt_lite_driver_options_initialize( &out_options->default_device_options); } -static iree_status_t iree_hal_xrt_lite_driver_options_verify( - const iree_hal_xrt_lite_driver_options_t* options) { - // TODO(null): verify that the parameters are within expected ranges and any - // requested features are supported. - return iree_ok_status(); -} - IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( iree_string_view_t identifier, const iree_hal_xrt_lite_driver_options_t* options, @@ -53,11 +46,6 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( *out_driver = nullptr; IREE_TRACE_ZONE_BEGIN(z0); - // TODO(null): verify options; this may be moved after any libraries are - // loaded so the verification can use underlying implementation queries. 
- IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_hal_xrt_lite_driver_options_verify(options)); - iree_hal_xrt_lite_driver_t* driver = nullptr; iree_host_size_t total_size = sizeof(*driver) + identifier.size; IREE_RETURN_AND_END_ZONE_IF_ERROR( diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index 7108ae2e7..08ff85dc0 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -249,6 +249,6 @@ iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( namespace { const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable = { - /*.destroy=*/iree_hal_xrt_lite_native_executable_destroy, + .destroy = iree_hal_xrt_lite_native_executable_destroy, }; } // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc index e9a04144d..8d0be5ad4 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -72,7 +72,7 @@ static bool iree_hal_xrt_lite_nop_executable_cache_can_prepare_format( iree_hal_executable_caching_mode_t caching_mode, iree_string_view_t executable_format) { return iree_string_view_equal(executable_format, - iree_make_cstring_view("XRT")); + iree_make_cstring_view("PDIR")); } static iree_status_t iree_hal_xrt_lite_nop_executable_cache_prepare_executable( @@ -89,10 +89,10 @@ static iree_status_t iree_hal_xrt_lite_nop_executable_cache_prepare_executable( namespace { const iree_hal_executable_cache_vtable_t iree_hal_xrt_lite_nop_executable_cache_vtable = { - /*.destroy = */ iree_hal_xrt_lite_nop_executable_cache_destroy, - /*.can_prepare_format = */ - iree_hal_xrt_lite_nop_executable_cache_can_prepare_format, - /*.prepare_executable = */ - iree_hal_xrt_lite_nop_executable_cache_prepare_executable, + 
.destroy = iree_hal_xrt_lite_nop_executable_cache_destroy, + .can_prepare_format = + iree_hal_xrt_lite_nop_executable_cache_can_prepare_format, + .prepare_executable = + iree_hal_xrt_lite_nop_executable_cache_prepare_executable, }; } // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc index 17810350f..173db9483 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc @@ -6,10 +6,9 @@ #include "iree-amd-aie/driver/xrt-lite/nop_semaphore.h" -#include - #include "iree/base/api.h" #include "iree/hal/utils/semaphore_base.h" +#include "util.h" struct iree_hal_xrt_lite_semaphore_t { iree_hal_semaphore_t base; @@ -62,54 +61,12 @@ static void iree_hal_xrt_lite_semaphore_destroy( IREE_TRACE_ZONE_END(z0); } -static iree_status_t iree_hal_xrt_lite_semaphore_query( - iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) { - iree_hal_xrt_lite_semaphore_t* semaphore = - iree_hal_xrt_lite_semaphore_cast(base_semaphore); - // TODO: Support semaphores completely. - *out_value = - iree_atomic_load_int64(&semaphore->value, iree_memory_order_acquire); - return iree_ok_status(); -} - -static iree_status_t iree_hal_xrt_lite_semaphore_signal( - iree_hal_semaphore_t* base_semaphore, uint64_t new_value) { - iree_hal_xrt_lite_semaphore_t* semaphore = - iree_hal_xrt_lite_semaphore_cast(base_semaphore); - // TODO: Support semaphores completely. Return OK currently as everything is - // synchronized for each submit to allow things to run. 
- iree_atomic_store_int64(&semaphore->value, new_value, - iree_memory_order_release); - iree_hal_semaphore_poll(&semaphore->base); - return iree_ok_status(); -} - -static void iree_hal_xrt_lite_semaphore_fail( - iree_hal_semaphore_t* base_semaphore, iree_status_t status) { - iree_hal_xrt_lite_semaphore_t* semaphore = - iree_hal_xrt_lite_semaphore_cast(base_semaphore); - // TODO: save status and mark timepoint as failed. - iree_status_ignore(status); - iree_hal_semaphore_poll(&semaphore->base); -} - -static iree_status_t iree_hal_xrt_lite_semaphore_wait( - iree_hal_semaphore_t* base_semaphore, uint64_t value, - iree_timeout_t timeout) { - iree_hal_xrt_lite_semaphore_t* semaphore = - iree_hal_xrt_lite_semaphore_cast(base_semaphore); - // TODO: Support semaphores completely. Return OK currently as everything is - // synchronized for each submit to allow things to run. - iree_hal_semaphore_poll(&semaphore->base); - return iree_ok_status(); -} - namespace { const iree_hal_semaphore_vtable_t iree_hal_xrt_lite_semaphore_vtable = { - /*.destroy = */ iree_hal_xrt_lite_semaphore_destroy, - /*.query = */ iree_hal_xrt_lite_semaphore_query, - /*.signal = */ iree_hal_xrt_lite_semaphore_signal, - /*.fail = */ iree_hal_xrt_lite_semaphore_fail, - /*.wait = */ iree_hal_xrt_lite_semaphore_wait, + .destroy = iree_hal_xrt_lite_semaphore_destroy, + .query = unimplemented_ok_status, + .signal = unimplemented_ok_status, + .fail = unimplemented_ok_void, + .wait = unimplemented_ok_status, }; } // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt index 9d9c40fd1..ca6cec933 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -29,5 +29,7 @@ iree_cc_library( uuid COPTS $<$:-fexceptions -frtti> + DEFINES + $<$:SHIM_XDNA_DEBUG> PUBLIC ) diff --git 
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h index 0e9cbd93e..740de7462 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h @@ -35,7 +35,7 @@ template template void shim_debug(const char *fmt, Args &&...args) { -#ifndef NDEBUG +#ifdef SHIM_XDNA_DEBUG std::string format{"shim_xdna: "}; format += std::string(fmt); format += "\n"; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h index 92556fcd9..d983f27c1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h @@ -19,6 +19,14 @@ void unimplemented(Params...) { IREE_ASSERT(false && "unimplemented"); } +template +iree_status_t unimplemented_ok_status(Params...) { + return iree_ok_status(); +} + +template +void unimplemented_ok_void(Params...) {} + #define MEMBER_WRAPPER(From, To, member, return_t) \ template \ static return_t To##_##member(From* b, Args... 
args) { \ From ec92dfb3a23b521eda5969b66d439b1d3445010f Mon Sep 17 00:00:00 2001 From: makslevental Date: Sat, 12 Oct 2024 19:18:18 -0400 Subject: [PATCH 14/35] put shim_debug behind define --- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp | 2 +- .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 2 +- .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 32 +++++++++---------- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 14 ++++---- .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 20 ++++++------ .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 12 +++---- .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 14 ++++---- .../xrt-lite/shim/linux/kmq/shim_debug.h | 8 +++-- .../driver/xrt/cts/CMakeLists.txt | 3 ++ 9 files changed, 56 insertions(+), 51 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index 7e4aded8a..df9191a39 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -1159,7 +1159,7 @@ LogicalResult aie2xclbin( } Path pdiPath = tempDirPath / "design.pdi"; - if (failed(generatePDI(pdiPath, tempDirPath))) { + if (failed(generatePDI(pdiPath.string(), tempDirPath))) { llvm::errs() << "Failed to generate PDI\n"; return failure(); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt index ca6cec933..bb2c63cfc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -30,6 +30,6 @@ iree_cc_library( COPTS $<$:-fexceptions -frtti> DEFINES - $<$:SHIM_XDNA_DEBUG> +# $<$:SHIM_XDNA_DEBUG> PUBLIC ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index 27723d969..11f0f04df 100644 --- 
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -184,7 +184,7 @@ drm_bo::~drm_bo() { try { free_drm_bo(m_parent.m_pdev, m_handle); } catch (const std::system_error &e) { - shim_debug("Failed to free DRM BO: %s", e.what()); + SHIM_DEBUG("Failed to free DRM BO: %s", e.what()); } } @@ -247,7 +247,7 @@ void bo::mmap_bo(size_t align) { } void bo::munmap_bo() { - shim_debug("Unmap BO, aligned %p parent %p", m_aligned, m_parent); + SHIM_DEBUG("Unmap BO, aligned %p parent %p", m_aligned, m_parent); if (m_drm_bo->m_map_offset == AMDXDNA_INVALID_ADDR) return; unmap_drm_bo(m_pdev, m_aligned, m_aligned_size); @@ -309,30 +309,30 @@ bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, #ifndef NDEBUG switch (m_flags.all) { case 0x0: - shim_debug("allocating dev heap"); + SHIM_DEBUG("allocating dev heap"); break; case 0x1000000: // pdi bo - shim_debug("allocating pdi bo"); + SHIM_DEBUG("allocating pdi bo"); break; case 0x20000000: // XCL_BO_FLAGS_P2P in create_free_bo test - shim_debug("allocating XCL_BO_FLAGS_P2P"); + SHIM_DEBUG("allocating XCL_BO_FLAGS_P2P"); break; case 0x80000000: // XCL_BO_FLAGS_EXECBUF in create_free_bo test - shim_debug("allocating XCL_BO_FLAGS_EXECBUF"); + SHIM_DEBUG("allocating XCL_BO_FLAGS_EXECBUF"); break; case 0x1001000000: // debug bo - shim_debug("allocating debug bo"); + SHIM_DEBUG("allocating debug bo"); break; default: shim_err(-1, "unknown flags %d", flags); } #endif - shim_debug( + SHIM_DEBUG( "Allocated KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, " "type=%d, drm_bo=%d)", m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); @@ -341,14 +341,14 @@ bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, bo::bo(const pdev &p, int ehdl) : m_pdev(p), m_import(ehdl) { import_bo(); mmap_bo(); - shim_debug( + SHIM_DEBUG( "Imported KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, type=%d, " "drm_bo=%d)", 
m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); } bo::~bo() { - shim_debug("Freeing KMQ BO, %s", describe().c_str()); + SHIM_DEBUG("Freeing KMQ BO, %s", describe().c_str()); munmap_bo(); try { @@ -356,7 +356,7 @@ bo::~bo() { // If BO is in use, we should block and wait in driver free_bo(); } catch (const std::system_error &e) { - shim_debug("Failed to free BO: %s", e.what()); + SHIM_DEBUG("Failed to free BO: %s", e.what()); } } @@ -389,7 +389,7 @@ void bo::attach_to_ctx() { if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; auto boh = get_drm_bo_handle(); - shim_debug("Attaching drm_bo %d to ctx: %d", boh, m_owner_ctx_id); + SHIM_DEBUG("Attaching drm_bo %d to ctx: %d", boh, m_owner_ctx_id); attach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); } @@ -397,14 +397,14 @@ void bo::detach_from_ctx() { if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; auto boh = get_drm_bo_handle(); - shim_debug("Detaching drm_bo %d from ctx: %d", boh, m_owner_ctx_id); + SHIM_DEBUG("Detaching drm_bo %d from ctx: %d", boh, m_owner_ctx_id); detach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); } std::unique_ptr bo::share() const { auto boh = get_drm_bo_handle(); auto fd = export_drm_bo(m_pdev, boh); - shim_debug("Exported bo %d to fd %d", boh, fd); + SHIM_DEBUG("Exported bo %d to fd %d", boh, fd); return std::make_unique(fd); } @@ -449,7 +449,7 @@ void bo::bind_at(size_t pos, const bo &boh, size_t offset, size_t size) { if (boh.get_type() != AMDXDNA_BO_CMD) { auto h = boh.get_drm_bo_handle(); m_args_map[pos] = h; - shim_debug("Added arg BO %d to cmd BO %d", h, get_drm_bo_handle()); + SHIM_DEBUG("Added arg BO %d to cmd BO %d", h, get_drm_bo_handle()); } else { const size_t max_args_order = 6; const size_t max_args = 1 << max_args_order; @@ -461,7 +461,7 @@ void bo::bind_at(size_t pos, const bo &boh, size_t offset, size_t size) { m_args_map[key + i] = hs[i]; bohs += std::to_string(hs[i]) + " "; } - shim_debug("Added arg BO %s to cmd BO %d", bohs.c_str(), + 
SHIM_DEBUG("Added arg BO %s to cmd BO %d", bohs.c_str(), get_drm_bo_handle()); } } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 458f59305..dae7ca0d5 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -112,19 +112,19 @@ pdev::pdev() { // TODO(max): hardcoded m_dev_fd = ::open("/dev/accel/accel0", O_RDWR); if (m_dev_fd < 0) shim_err(EINVAL, "Failed to open KMQ device"); - shim_debug("Device opened, fd=%d", m_dev_fd); + SHIM_DEBUG("Device opened, fd=%d", m_dev_fd); m_dev_heap_bo = std::make_unique(*this, dev_mem_size, AMDXDNA_BO_DEV_HEAP); - shim_debug("Created KMQ pcidev"); + SHIM_DEBUG("Created KMQ pcidev"); } pdev::~pdev() { - shim_debug("Destroying KMQ pcidev"); + SHIM_DEBUG("Destroying KMQ pcidev"); const std::lock_guard lock(m_lock); m_dev_heap_bo.reset(); ::close(m_dev_fd); - shim_debug("Device closed, fd=%d", m_dev_fd); - shim_debug("Destroyed KMQ pcidev"); + SHIM_DEBUG("Device closed, fd=%d", m_dev_fd); + SHIM_DEBUG("Destroyed KMQ pcidev"); } void pdev::ioctl(unsigned long cmd, void *arg) const { @@ -143,9 +143,9 @@ void *pdev::mmap(void *addr, size_t len, int prot, int flags, return ret; } -device::device() { shim_debug("Created KMQ device"); } +device::device() { SHIM_DEBUG("Created KMQ device"); } -device::~device() { shim_debug("Destroying KMQ device"); } +device::~device() { SHIM_DEBUG("Destroying KMQ device"); } const pdev &device::get_pdev() const { return m_pdev; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp index 55df59e27..ba48b3f9e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp @@ -137,14 +137,14 @@ fence_handle::fence_handle(const 
device &device) : m_pdev(device.get_pdev()), m_import(std::make_unique(-1)), m_syncobj_hdl(create_syncobj(m_pdev)) { - shim_debug("Fence allocated: %d@%d", m_syncobj_hdl, m_state); + SHIM_DEBUG("Fence allocated: %d@%d", m_syncobj_hdl, m_state); } fence_handle::fence_handle(const device &device, int ehdl) : m_pdev(device.get_pdev()), m_import(std::make_unique(ehdl)), m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())) { - shim_debug("Fence imported: %d@%ld", m_syncobj_hdl, m_state); + SHIM_DEBUG("Fence imported: %d@%ld", m_syncobj_hdl, m_state); } fence_handle::fence_handle(const fence_handle &f) @@ -153,15 +153,15 @@ fence_handle::fence_handle(const fence_handle &f) m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())), m_signaled{f.m_signaled}, m_state{f.m_state} { - shim_debug("Fence cloned: %d@%ld", m_syncobj_hdl, m_state); + SHIM_DEBUG("Fence cloned: %d@%ld", m_syncobj_hdl, m_state); } fence_handle::~fence_handle() { - shim_debug("Fence going away: %d@%ld", m_syncobj_hdl, m_state); + SHIM_DEBUG("Fence going away: %d@%ld", m_syncobj_hdl, m_state); try { destroy_syncobj(m_pdev, m_syncobj_hdl); } catch (const std::system_error &e) { - shim_debug("Failed to destroy fence_handle"); + SHIM_DEBUG("Failed to destroy fence_handle"); } } @@ -190,13 +190,13 @@ uint64_t fence_handle::wait_next_state() const { // Timeout value is ignored for now. 
void fence_handle::wait(uint32_t timeout_ms) const { auto st = signal_next_state(); - shim_debug("Waiting for command fence_handle %d@%ld", m_syncobj_hdl, st); + SHIM_DEBUG("Waiting for command fence_handle %d@%ld", m_syncobj_hdl, st); wait_syncobj_done(m_pdev, m_syncobj_hdl, st); } void fence_handle::submit_wait(const hw_ctx *ctx) const { auto st = signal_next_state(); - shim_debug("Submitting wait for command fence_handle %d@%ld", m_syncobj_hdl, + SHIM_DEBUG("Submitting wait for command fence_handle %d@%ld", m_syncobj_hdl, st); submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1); } @@ -212,13 +212,13 @@ uint64_t fence_handle::signal_next_state() const { void fence_handle::signal() const { auto st = signal_next_state(); - shim_debug("Signaling command fence_handle %d@%ld", m_syncobj_hdl, st); + SHIM_DEBUG("Signaling command fence_handle %d@%ld", m_syncobj_hdl, st); signal_syncobj(m_pdev, m_syncobj_hdl, st); } void fence_handle::submit_signal(const hw_ctx *ctx) const { auto st = signal_next_state(); - shim_debug("Submitting signal command fence_handle %d@%ld", m_syncobj_hdl, + SHIM_DEBUG("Submitting signal command fence_handle %d@%ld", m_syncobj_hdl, st); submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st); } @@ -237,7 +237,7 @@ void fence_handle::submit_wait( for (auto f : fences) { auto fh = static_cast(f); auto st = fh->wait_next_state(); - shim_debug("Waiting for command fence_handle %d@%ld", fh->m_syncobj_hdl, + SHIM_DEBUG("Waiting for command fence_handle %d@%ld", fh->m_syncobj_hdl, st); hdls[i] = fh->m_syncobj_hdl; pts[i] = st; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index f362a2f80..c7fbc0e7e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -16,7 +16,7 @@ hw_ctx::hw_ctx(device &dev, const std::map &qos, std::unique_ptr q, const std::vector 
&pdi, const std::string &cu_name, size_t functional) : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) { - shim_debug("Creating HW context..."); + SHIM_DEBUG("Creating HW context..."); for (auto &[key, value] : qos) { if (key == "gops") @@ -78,23 +78,23 @@ hw_ctx::hw_ctx(device &device, const std::vector &pdi, arg.param_val_size = cu_conf_param_buf.size(); m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &arg); - shim_debug("Created KMQ HW context (%d)", m_handle); + SHIM_DEBUG("Created KMQ HW context (%d)", m_handle); } hw_ctx::~hw_ctx() { try { delete_ctx_on_device(); } catch (const std::system_error &e) { - shim_debug("Failed to delete context on device: %s", e.what()); + SHIM_DEBUG("Failed to delete context on device: %s", e.what()); } - shim_debug("Destroyed HW context (%d)...", m_handle); - shim_debug("Destroying KMQ HW context (%d)...", m_handle); + SHIM_DEBUG("Destroyed HW context (%d)...", m_handle); + SHIM_DEBUG("Destroying KMQ HW context (%d)...", m_handle); } cuidx_t hw_ctx::open_cu_context(const std::string &cu_name) { for (uint32_t i = 0; i < m_cu_info.size(); i++) { auto &ci = m_cu_info[i]; - shim_debug("ci.m_name %s", ci.m_name.c_str()); + SHIM_DEBUG("ci.m_name %s", ci.m_name.c_str()); if (ci.m_name == cu_name) return cuidx_t{.index = i}; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp index 868b2eb0e..22a29549b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -20,7 +20,7 @@ int wait_cmd(const shim_xdna::pdev &pdev, const shim_xdna::hw_ctx *ctx, int ret = 1; auto id = cmd->get_cmd_id(); - shim_xdna::shim_debug("Waiting for cmd (%ld)...", id); + SHIM_DEBUG("Waiting for cmd (%ld)...", id); amdxdna_drm_wait_cmd wcmd = { .hwctx = ctx->m_handle, @@ -45,16 +45,16 @@ hw_q::hw_q(const device &device) : m_hwctx(nullptr), 
m_pdev(device.get_pdev()), m_queue_boh(AMDXDNA_INVALID_BO_HANDLE) { - shim_debug("Created KMQ HW queue"); + SHIM_DEBUG("Created KMQ HW queue"); } void hw_q::bind_hwctx(const hw_ctx *ctx) { m_hwctx = ctx; - shim_debug("Bond HW queue to HW context %d", m_hwctx->m_handle); + SHIM_DEBUG("Bond HW queue to HW context %d", m_hwctx->m_handle); } void hw_q::unbind_hwctx() { - shim_debug("Unbond HW queue from HW context %d", m_hwctx->m_handle); + SHIM_DEBUG("Unbond HW queue from HW context %d", m_hwctx->m_handle); m_hwctx = nullptr; } @@ -71,9 +71,7 @@ void hw_q::submit_wait(const std::vector &fences) { void hw_q::submit_signal(const fence_handle *f) { f->submit_signal(m_hwctx); } -hw_q::~hw_q() { - shim_debug("Destroying KMQ HW queue"); -} +hw_q::~hw_q() { SHIM_DEBUG("Destroying KMQ HW queue"); } void hw_q::issue_command(bo *cmd_bo) { // Assuming 1024 max args per cmd bo @@ -94,7 +92,7 @@ void hw_q::issue_command(bo *cmd_bo) { auto id = ecmd.seq; cmd_bo->set_cmd_id(id); - shim_debug("Submitted command (%ld)", id); + SHIM_DEBUG("Submitted command (%ld)", id); } int poll_command(bo *cmd) { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h index 740de7462..bf853312e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h @@ -35,14 +35,18 @@ template template void shim_debug(const char *fmt, Args &&...args) { -#ifdef SHIM_XDNA_DEBUG std::string format{"shim_xdna: "}; format += std::string(fmt); format += "\n"; debugf(format.c_str(), std::forward(args)...); -#endif } } // namespace shim_xdna +#ifdef SHIM_XDNA_DEBUG +#define SHIM_DEBUG(...) shim_xdna::shim_debug(__VA_ARGS__) +#else +#define SHIM_DEBUG(...) 
+#endif + #endif // SHIM_DEBUG_H diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt index 07746787d..e068c08e3 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt @@ -43,6 +43,9 @@ iree_bytecode_module( FLAGS --compile-mode=hal-executable --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} + # on windows iree-aie-xclbinutil for some reason isn't found by iree's findTool + # so set this instead to the bin dir + --iree-amd-aie-install-dir=${CMAKE_BINARY_DIR} --iree-hal-target-backends=amd-aie --iree-amdaie-lower-to-aie-pipeline=air --iree-amdaie-target-device=${TARGET_DEVICE} From 63e917d9b94d133a54bd6e0225c460640190f7ff Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sat, 12 Oct 2024 23:03:27 -0400 Subject: [PATCH 15/35] parameterize tests with device-hal --- .github/workflows/ci-linux.yml | 7 ++-- .github/workflows/ci-windows.yml | 4 +-- build_tools/ci/cpu_comparison/run.py | 22 +++++++++--- build_tools/ci/run_matmul_test.sh | 35 ++++++++++++------- build_tools/download_peano.ps1 | 2 +- .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 2 +- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 3 +- tests/conftest.py | 18 ++++++---- 8 files changed, 62 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 44a3eb03b..ecabe4267 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -168,7 +168,8 @@ jobs: $PWD/iree-install \ $PWD/llvm-aie \ --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v + --reset-npu-between-runs -v \ + --device-hal=xrt-lite - name: E2E correctness matmul test run: | @@ -177,6 +178,7 @@ jobs: # which can fail if limit is to low sudo prlimit -lunlimited --pid $$ source .venv/bin/activate + export DEVICE_HAL=xrt-lite bash build_tools/ci/run_matmul_test.sh \ test_matmuls \ iree-install \ @@ 
-189,7 +191,8 @@ jobs: pytest -v tests \ --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ - --peano-install-dir=$PWD/llvm-aie + --peano-install-dir=$PWD/llvm-aie \ + --device-hal=xrt-lite - name: XRT-LITE tests run: | diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index d45f22396..2684db1ca 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -198,5 +198,5 @@ jobs: mkdir temp pytest tests -sv ` --basetemp=$PWD\temp ` - --iree-install-dir="$PWD/iree-install" ` - --peano-install-dir="$PWD/llvm-aie" + --iree-install-dir="$PWD\iree-install" ` + --peano-install-dir="$PWD\llvm-aie" diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 5875bce25..3b87934c9 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -2,18 +2,19 @@ # Copyright 2024 The IREE Authors -import sys import argparse import os import platform import re import subprocess +import sys import time from pathlib import Path from textwrap import dedent import numpy as np +from convolution_template.convolution_generator import ConvolutionMlirGenerator from input_generator import ( generate_inputs, verify_determinism, @@ -22,7 +23,6 @@ np_from_binfile, ) from matmul_template.matmul_generator import generate_matmul_test -from convolution_template.convolution_generator import ConvolutionMlirGenerator from output_comparer import compare @@ -146,7 +146,7 @@ def generate_aie_vmfb( f"--iree-amd-aie-install-dir={config.iree_install_dir}", f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}", f"--iree-hal-dump-executable-files-to={config.output_dir}", - "--iree-amdaie-device-hal=xrt-lite", + f"--iree-amdaie-device-hal={config.device_hal}", "--iree-scheduling-optimize-bindings=false", "--iree-hal-memoization=false", "--iree-hal-indirect-command-buffers=false", @@ -192,7 +192,7 @@ def generate_aie_output(config, aie_vmfb, input_args, function_name, name, outpu 
config.iree_run_exe, f"--module={aie_vmfb}", *input_args, - "--device=xrt-lite", + f"--device={config.device_hal}", f"--output=@{aie_bin}", ] if function_name: @@ -268,6 +268,7 @@ def __init__( reset_npu_between_runs, do_not_run_aie, additional_aie_compilation_flags, + device_hal, ): self.output_dir = output_dir self.iree_install_dir = iree_install_dir @@ -284,6 +285,7 @@ def __init__( self.reset_npu_between_runs = reset_npu_between_runs self.do_not_run_aie = do_not_run_aie self.additional_aie_compilation_flags = additional_aie_compilation_flags + self.device_hal = device_hal # Try get the xrt and (linux) kernel versions. self.linux_kernel = "undetermined" @@ -847,6 +849,7 @@ def all_tests( do_not_run_aie, test_set, additional_aie_compilation_flags, + device_hal ): """ There are a few ways to add tests to this script: @@ -888,6 +891,7 @@ def all_tests( reset_npu_between_runs, do_not_run_aie, additional_aie_compilation_flags, + device_hal ) if verbose: print(config) @@ -1024,6 +1028,15 @@ def all_tests( default="", ) + parser.add_argument( + "--device-hal", + default="xrt", + const="xrt", + nargs="?", + choices=["xrt", "xrt-lite"], + help="device HAL to use (default: %(default)s)", + ) + args = parser.parse_args() test_set_list = args.test_set.split(",") @@ -1039,4 +1052,5 @@ def all_tests( args.do_not_run_aie, test_set_list, args.additional_aie_compilation_flags, + args.device_hal ) diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 5ced68435..8c9b79074 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -28,16 +28,16 @@ if [ "$#" -lt 2 ] || [ "$#" -gt 5 ]; then # The expected parameters are # 1) (required) # 2) (required) - # 4) (optional) + # 3) (optional) + # 4) (optional) # 5) (optional) - # 6) (optional) echo -e "Illegal number of parameters: $#, expected 2-5 parameters." 
\ "\n The parameters are as follows:" \ "\n 1) (required)" \ "\n 2) (required)" \ "\n 3) (optional)" \ - "\n 4) (optional)" \ - "\n 5) (optional)" \ + "\n 4) (optional)" \ + "\n 5) (optional)" \ "\n Example, dependent on environment variables:" \ "\n ./run_matmul_test.sh " \ "results_dir_tmp \$IREE_INSTALL_DIR " \ @@ -109,6 +109,17 @@ else VITIS=`realpath "$4"` fi +# Parameter 5) +if [ -z "${5-}" ]; then + XRT_DIR=/opt/xilinx/xrt +else + XRT_DIR=`realpath "$5"` +fi +if [ -f "$XRT_DIR/setup.sh" ]; then + source $XRT_DIR/setup.sh +fi + + THIS_DIR="$(cd $(dirname $0) && pwd)" ROOT_DIR="$(cd $THIS_DIR/../.. && pwd)" @@ -129,11 +140,16 @@ fi GITHUB_ACTIONS="${GITHUB_ACTIONS:-false}" +# Circumvent xclbin security (no longer needed as of April 2024 XDNA driver) +export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 + cd ${OUTPUT_DIR} export MATMUL_TESTS_RUN=0 export MATMUL_TESTS_FAILS=0 +DEVICE_HAL="${DEVICE_HAL:-xrt}" + ############################################################################### # Define helper function # ############################################################################### @@ -163,8 +179,6 @@ function run_matmul_test() { local target_device="npu1_4col" - local device="xrt-lite" - local peano_install_path="${PEANO}" local amd_aie_install_path="${IREE_INSTALL_DIR}" @@ -261,10 +275,6 @@ function run_matmul_test() { target_backend="$2" shift 2 ;; - --device) - device="$2" - shift 2 - ;; --peano_install_path) peano_install_path="$2" shift 2 @@ -392,7 +402,7 @@ function run_matmul_test() { --iree-amd-aie-enable-chess=${use_chess} \ --iree-amdaie-enable-packet-flow=${enable_packet_flow} \ --iree-hal-dump-executable-files-to=$PWD \ - --iree-amdaie-device-hal=xrt-lite \ + --iree-amdaie-device-hal=${DEVICE_HAL} \ --iree-hal-memoization=false \ --iree-hal-indirect-command-buffers=false \ --mlir-elide-resource-strings-if-larger=10 \ @@ -451,7 +461,7 @@ function run_matmul_test() { COMMAND="${TEST_RUNNER} \ --module=${matmul_vmfb} \ --module=${calls_vmfb} \ - 
--device=${device} \ + --device=${DEVICE_HAL} \ --max_elements_to_check=${max_elements_to_check}" total_num_runs=$(( num_repeat_runs * num_corruption_repeat_runs)) @@ -518,7 +528,6 @@ run_matmul_test \ --acc_type "f32" \ --target_backend "amd-aie" \ --target_device "npu1_4col" \ - --device "xrt-lite" \ --peano_install_path "${PEANO}" \ --amd_aie_install_path "${IREE_INSTALL_DIR}" \ --vitis_path "${VITIS}" \ diff --git a/build_tools/download_peano.ps1 b/build_tools/download_peano.ps1 index 6589cc562..74a1240d8 100644 --- a/build_tools/download_peano.ps1 +++ b/build_tools/download_peano.ps1 @@ -12,4 +12,4 @@ pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/e $peano = (Get-ChildItem -Filter llvm*.whl) $new_name = ($peano.Basename + ".zip") Rename-Item -Path $peano.Name -NewName $new_name -Expand-Archive $new_name -DestinationPath $PWD.Path +Expand-Archive $new_name -DestinationPath $PWD.Path -Force diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt index bb2c63cfc..ca6cec933 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -30,6 +30,6 @@ iree_cc_library( COPTS $<$:-fexceptions -frtti> DEFINES -# $<$:SHIM_XDNA_DEBUG> + $<$:SHIM_XDNA_DEBUG> PUBLIC ) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index dae7ca0d5..153e11a88 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -270,8 +270,7 @@ std::filesystem::path find_npu_device() { auto rel = std::filesystem::relative(actual_path, "/sys/devices"); if (!rel.empty() && rel.native()[0] != '.') return absolute(actual_path); } - std::cerr << "No npu device found" << std::endl; 
- exit(-1); + shim_err(errno, "No npu device found"); } } // namespace shim_xdna diff --git a/tests/conftest.py b/tests/conftest.py index 3b1518c21..a31265dfb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,14 +3,13 @@ import numpy as np import pytest -from ml_dtypes import bfloat16 - from iree.compiler import ir from iree.compiler._mlir_libs import get_dialect_registry from iree.compiler.api import Session, Output, Source, _initializeGlobalCL from iree.compiler.extras import types as T from iree.runtime import VmModule from iree.runtime import get_driver, Config, SystemContext +from ml_dtypes import bfloat16 for t in [ "i8", @@ -47,6 +46,13 @@ def pytest_addoption(parser): parser.addoption("--output-dir", type=abs_path) parser.addoption("--vitis-dir", type=abs_path) parser.addoption("--iree-aie-debug", action="store_true") + parser.addoption( + "--device-hal", + default="xrt", + const="xrt", + nargs="?", + choices=["xrt", "xrt-lite"], + ) @pytest.fixture(scope="session") @@ -80,7 +86,7 @@ def iree_session(request, pytestconfig, global_cl_args) -> Session: f"--iree-amd-aie-install-dir={pytestconfig.option.iree_install_dir}", f"--iree-amd-aie-enable-chess={use_chess}", f"--iree-amdaie-enable-packet-flow={enable_packet_flow}", - "--iree-amdaie-device-hal=xrt-lite", + f"--iree-amdaie-device-hal={pytestconfig.option.device_hal}", ] if pytestconfig.option.vitis_dir: flags += [f"--iree-amd-aie-vitis-install-dir={pytestconfig.option.vitis_dir}"] @@ -99,7 +105,7 @@ def iree_session(request, pytestconfig, global_cl_args) -> Session: @pytest.fixture -def session_module(iree_session, tmp_path) -> ir.Module: +def session_module(iree_session) -> ir.Module: with ir.Location.unknown(iree_session.context): module_op = ir.Module.create() with ir.InsertionPoint(module_op.body): @@ -107,8 +113,8 @@ def session_module(iree_session, tmp_path) -> ir.Module: @pytest.fixture(scope="session") -def device(device="xrt-lite") -> ir.Module: - yield 
get_driver(device).create_default_device() +def device(pytestconfig) -> ir.Module: + yield get_driver(pytestconfig.option.device_hal).create_default_device() @contextmanager From 9310e7746e045e9be02702b3d683f588cda670d2 Mon Sep 17 00:00:00 2001 From: makslevental Date: Tue, 15 Oct 2024 11:16:23 -0400 Subject: [PATCH 16/35] make xrt-lite default --- .github/workflows/ci-linux.yml | 3 --- .github/workflows/ci-windows.yml | 7 +++++-- build_tools/ci/cpu_comparison/run.py | 4 ++-- build_tools/ci/run_matmul_test.sh | 2 +- tests/conftest.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index ecabe4267..392adb020 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -169,7 +169,6 @@ jobs: $PWD/llvm-aie \ --vitis-dir /opt/Xilinx/Vitis/2024.2 \ --reset-npu-between-runs -v \ - --device-hal=xrt-lite - name: E2E correctness matmul test run: | @@ -178,7 +177,6 @@ jobs: # which can fail if limit is to low sudo prlimit -lunlimited --pid $$ source .venv/bin/activate - export DEVICE_HAL=xrt-lite bash build_tools/ci/run_matmul_test.sh \ test_matmuls \ iree-install \ @@ -192,7 +190,6 @@ jobs: --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ --peano-install-dir=$PWD/llvm-aie \ - --device-hal=xrt-lite - name: XRT-LITE tests run: | diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 2684db1ca..86b5f4f8a 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -177,6 +177,7 @@ jobs: shell: bash run: | source .venv/Scripts/activate + export DEVICE_HAL=xrt bash build_tools/ci/run_matmul_test.sh \ /c/test_matmuls \ $PWD/iree-install \ @@ -189,7 +190,8 @@ jobs: python build_tools/ci/cpu_comparison/run.py \ /c/test_aie_vs_cpu \ $PWD/iree-install \ - $PWD/llvm-aie -v + $PWD/llvm-aie -v \ + --device-hal=xrt - name: Python tests run: | @@ -199,4 +201,5 @@ jobs: pytest tests -sv ` --basetemp=$PWD\temp ` 
--iree-install-dir="$PWD\iree-install" ` - --peano-install-dir="$PWD\llvm-aie" + --peano-install-dir="$PWD\llvm-aie" ` + --device-hal=xrt diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 3b87934c9..8ddd22ddc 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -1030,8 +1030,8 @@ def all_tests( parser.add_argument( "--device-hal", - default="xrt", - const="xrt", + default="xrt-lite", + const="xrt-lite", nargs="?", choices=["xrt", "xrt-lite"], help="device HAL to use (default: %(default)s)", diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 8c9b79074..c1c5a6d56 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -148,7 +148,7 @@ cd ${OUTPUT_DIR} export MATMUL_TESTS_RUN=0 export MATMUL_TESTS_FAILS=0 -DEVICE_HAL="${DEVICE_HAL:-xrt}" +DEVICE_HAL="${DEVICE_HAL:-xrt-lite}" ############################################################################### # Define helper function # diff --git a/tests/conftest.py b/tests/conftest.py index a31265dfb..31c12bb7c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,8 +48,8 @@ def pytest_addoption(parser): parser.addoption("--iree-aie-debug", action="store_true") parser.addoption( "--device-hal", - default="xrt", - const="xrt", + default="xrt-lite", + const="xrt-lite", nargs="?", choices=["xrt", "xrt-lite"], ) From 01787d8eacd90539d86bf72e40b56fc6d42fafd6 Mon Sep 17 00:00:00 2001 From: makslevental Date: Tue, 15 Oct 2024 12:08:50 -0400 Subject: [PATCH 17/35] remove null comments --- .github/workflows/ci-linux.yml | 4 +- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 51 +---------------- .../src/iree-amd-aie/driver/xrt-lite/api.h | 12 +--- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 28 +-------- .../iree-amd-aie/driver/xrt-lite/device.cc | 16 ------ .../iree-amd-aie/driver/xrt-lite/driver.cc | 57 +------------------ .../xrt-lite/registration/driver_module.c | 
8 +-- 7 files changed, 13 insertions(+), 163 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 392adb020..44a3eb03b 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -168,7 +168,7 @@ jobs: $PWD/iree-install \ $PWD/llvm-aie \ --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v \ + --reset-npu-between-runs -v - name: E2E correctness matmul test run: | @@ -189,7 +189,7 @@ jobs: pytest -v tests \ --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ - --peano-install-dir=$PWD/llvm-aie \ + --peano-install-dir=$PWD/llvm-aie - name: XRT-LITE tests run: | diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index cde48e95b..b98e52f02 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -11,8 +11,6 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" #include "iree-amd-aie/driver/xrt-lite/util.h" -// TODO(null): use one ID per address space or pool - each shows as a different -// track in tracing tools. #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING static const char* IREE_HAL_XRT_LITE_ALLOCATOR_ID = "XRT-LITE unpooled"; #endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING @@ -33,11 +31,6 @@ struct iree_hal_xrt_lite_allocator { IREE_TRACE_ZONE_BEGIN(z0); iree_hal_resource_initialize(&iree_hal_xrt_lite_allocator_vtable, &this->resource); - // TODO(null): query device heaps, supported features (concurrent - // access/etc), and prepare any pools that will be used during allocation. - // It's expected that most failures that occur after creation are allocation - // request-specific so preparing here will help keep the errors more - // localized. 
IREE_TRACE_ZONE_END(z0); } @@ -45,15 +38,6 @@ struct iree_hal_xrt_lite_allocator { iree_hal_buffer_compatibility_t query_buffer_compatibility( iree_hal_buffer_params_t* params, iree_device_size_t* allocation_size) { - // TODO(null): set compatibility rules based on the implementation. - // Note that the user may have requested that the allocator place the - // allocation based on whatever is optimal for the indicated usage by - // including the IREE_HAL_MEMORY_TYPE_OPTIMAL flag. It's still required that - // the implementation meet all the requirements but it is free to place it - // in either host or device memory so long as the appropriate bits are - // updated to indicate where it landed. - (void)this; - // All buffers can be allocated on the heap. iree_hal_buffer_compatibility_t compatibility = IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; @@ -99,25 +83,7 @@ struct iree_hal_xrt_lite_allocator { "allocator cannot allocate a buffer with the given parameters"); } - // TODO(null): allocate the underlying device memory. - // The impl_ptr is just used for accounting and can be an opaque value - // (handle/etc) so long as it is consistent between the alloc and free and - // unique to the buffer while it is live. An example - // iree_hal_xrt_lite_buffer_wrap is provided that can be used for - // implementations that are managing memory using underlying allocators and - // just wrapping those device pointers in the HAL buffer type. Other - // implementations that require more tracking can provide their own buffer - // types that do such tracking for them. 
- uint32_t flags = XCL_BO_FLAGS_HOST_ONLY; - // if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { - // flags = XCL_BO_FLAGS_CACHEABLE; - // } else if (iree_all_bits_set(params->type, - // IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE)) { - // // TODO(max): the test here isn't specific enough - // flags = XCL_BO_FLAGS_EXECBUF; - // } - std::unique_ptr bo = shim_device->alloc_bo(allocation_size, flags); iree_hal_buffer_t* buffer = nullptr; @@ -129,9 +95,8 @@ struct iree_hal_xrt_lite_allocator { iree_hal_buffer_release_callback_null(), this->host_allocator, &buffer); if (iree_status_is_ok(status)) { - // TODO(null): ensure this accounting is balanced in deallocate_buffer. - // IREE_TRACE_ALLOC_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr, - // allocation_size); + IREE_TRACE_ALLOC_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr, + allocation_size); IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( &this->statistics, compat_params.type, allocation_size)); *out_buffer = buffer; @@ -142,19 +107,9 @@ struct iree_hal_xrt_lite_allocator { } void deallocate_buffer(iree_hal_buffer_t* base_buffer) { - // TODO(null): free the underlying device memory here. Buffers allocated - // from this allocator will call this method to handle cleanup. Note that - // because this method is responsible for doing the base - // iree_hal_buffer_destroy and the caller assumes the memory has been freed - // an implementation could pool the buffer handle and return it in the - // future. - - // TODO(null): if the buffer was imported then this accounting may need to - // be conditional depending on the implementation. 
bool was_imported = false; if (!was_imported) { - // TODO(max): - // IREE_TRACE_FREE_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr); + IREE_TRACE_FREE_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr); IREE_STATISTICS(iree_hal_allocator_statistics_record_free( &this->statistics, iree_hal_buffer_memory_type(base_buffer), iree_hal_buffer_allocation_size(base_buffer))); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h index 08c760682..9cc04cf46 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -14,20 +14,12 @@ extern "C" { #endif // __cplusplus -// Must be initialized with iree_hal_xrt_lite_device_options_initialize prior to -// use. -struct iree_hal_xrt_lite_device_options_t { - // TODO(null): options for initializing a device such as hardware identifiers, - // implementation mode switches, and debugging control. -}; +struct iree_hal_xrt_lite_device_options_t {}; -// Initializes |out_params| to default values. IREE_API_EXPORT void iree_hal_xrt_lite_device_options_initialize( struct iree_hal_xrt_lite_device_options_t* out_params); struct iree_hal_xrt_lite_driver_options_t { - // TODO(null): options for initializing the driver such as library search - // paths, version min/max, etc. struct iree_hal_xrt_lite_device_options_t default_device_options; }; @@ -50,7 +42,7 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create( iree_allocator_t host_allocator, iree_hal_device_t** out_device); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index a78ee9aa2..09dc19446 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -25,20 +25,13 @@ iree_status_t iree_hal_xrt_lite_buffer::map_range( ? 
IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); - // TODO(null): perform mapping as described. Note that local-to-buffer range - // adjustment may be required. The resulting mapping is populated with - // standard information such as contents indicating the host addressable - // memory range of the mapped buffer and implementation-specific information - // if additional resources are required. iree_hal_buffer_emulated_map_range - // can be used by implementations that have no way of providing host - // pointers at a large cost (alloc + device->host transfer on map and - // host->device transfer + dealloc on umap). Try not to use that. void* host_ptr = this->bo->map(); - IREE_ASSERT(host_ptr != nullptr); // Should be guaranteed by previous checks. + // Should be guaranteed by previous checks. + IREE_ASSERT(host_ptr != nullptr); uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; iree_status_t status = this->invalidate_range(local_byte_offset, local_byte_length); - // If we mapped for discard scribble over the bytes. This is not a mandated + // If we mapped for discard, scribble over the bytes. This is not a mandated // behavior but it will make debugging issues easier. Alternatively for heap // buffers we could reallocate them such that ASAN yells, but that would // only work if the entire buffer was discarded. @@ -54,18 +47,12 @@ iree_status_t iree_hal_xrt_lite_buffer::map_range( iree_status_t iree_hal_xrt_lite_buffer::unmap_range( iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { - // TODO(null): reverse of map_range. Note that cache invalidation is - // explicit via invalidate_range and need not be performed here. If using - // emulated mapping this must call iree_hal_buffer_emulated_unmap_range to - // release the transient resources. 
return this->flush_range(local_byte_offset, local_byte_length); } iree_status_t iree_hal_xrt_lite_buffer::invalidate_range( iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length) { - // TODO(null): invalidate the range if required by the buffer. Writes on the - // device are expected to be visible to the host after this returns. if (IREE_UNLIKELY(!this->bo)) { return iree_make_status( IREE_STATUS_FAILED_PRECONDITION, @@ -78,8 +65,6 @@ iree_status_t iree_hal_xrt_lite_buffer::invalidate_range( iree_status_t iree_hal_xrt_lite_buffer::flush_range( iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length) { - // TODO(null): flush the range if required by the buffer. Writes on the - // host are expected to be visible to the device after this returns. if (IREE_UNLIKELY(!this->bo)) { return iree_make_status( IREE_STATUS_FAILED_PRECONDITION, @@ -113,10 +98,6 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( memory_type, allowed_access, allowed_usage, &iree_hal_xrt_lite_buffer_vtable, &buffer->base); buffer->release_callback = release_callback; - // TODO(null): retain or take ownership of provided handles/pointers/etc. - // Implementations may want to pass in an internal buffer type discriminator - // if there are multiple or use different top-level iree_hal_buffer_t - // implementations. buffer->bo = std::move(bo); *out_buffer = &buffer->base; @@ -130,9 +111,6 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { iree_allocator_t host_allocator = base_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - // Optionally call a release callback when the buffer is destroyed. Not all - // implementations may require this but it's cheap and provides additional - // flexibility. 
if (buffer->release_callback.fn) { buffer->release_callback.fn(buffer->release_callback.user_data, base_buffer); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index e2c5ac2bc..3e2bdd578 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -41,8 +41,6 @@ struct iree_hal_xrt_lite_device { iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, iree_hal_command_buffer_t** out_command_buffer) { - // TODO(null): pass any additional resources required to create the command - // buffer. The implementation could pool command buffers here. if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) { return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unimplmented multi-shot command buffer"); @@ -119,7 +117,6 @@ struct iree_hal_xrt_lite_device { iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, iree_device_size_t allocation_size, iree_hal_buffer_t** IREE_RESTRICT out_buffer) { - // TODO: queue-ordered allocations. IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout())); IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( @@ -136,10 +133,6 @@ extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; void iree_hal_xrt_lite_device_options_initialize( iree_hal_xrt_lite_device_options_t* out_options) { memset(out_options, 0, sizeof(*out_options)); - // TODO(null): set defaults based on compiler configuration. Flags should not - // be used as multiple devices may be configured within the process or the - // hosting application may be authored in python/etc that does not use a flags - // mechanism accessible here. 
} iree_status_t iree_hal_xrt_lite_device_create( @@ -164,9 +157,6 @@ iree_status_t iree_hal_xrt_lite_device_create( device->host_allocator = host_allocator; device->shim_device = new shim_xdna::device; - // TODO(null): pass device handles and pool configuration to the allocator. - // Some implementations may share allocators across multiple devices created - // from the same driver. iree_status_t status = iree_hal_xrt_lite_allocator_create( host_allocator, device->shim_device, &device->device_allocator); iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, @@ -198,12 +188,6 @@ static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device); IREE_TRACE_ZONE_BEGIN(z0); - // TODO(null): release all implementation resources here. It's expected that - // this is only called once all outstanding resources created with this device - // have been released by the application and no work is outstanding. If the - // implementation performs internal async operations those should be shutdown - // and joined first. - iree_hal_allocator_release(device->device_allocator); delete device->shim_device; iree_allocator_free(host_allocator, device); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index e6a7b5061..743834034 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -28,11 +28,6 @@ static iree_hal_xrt_lite_driver_t* iree_hal_xrt_lite_driver_cast( void iree_hal_xrt_lite_driver_options_initialize( iree_hal_xrt_lite_driver_options_t* out_options) { memset(out_options, 0, sizeof(*out_options)); - - // TODO(null): set defaults based on compiler configuration. 
Flags should not - // be used as multiple devices may be configured within the process or the - // hosting application may be authored in python/etc that does not use a flags - // mechanism accessible here. iree_hal_xrt_lite_device_options_initialize( &out_options->default_device_options); } @@ -56,28 +51,11 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( iree_string_view_append_to_buffer( identifier, &driver->identifier, (char*)driver + total_size - identifier.size); - - // TODO(null): if there are any string fields then they will need to be - // retained as well (similar to the identifier they can be tagged on to the - // end of the driver struct). memcpy(&driver->options, options, sizeof(*options)); - - // TODO(null): load libraries and query driver support from the system. - // Devices need not be enumerated here if doing so is expensive; the - // application may create drivers just to see if they are present but defer - // device enumeration until the user requests one. Underlying implementations - // can sometimes do bonkers static init stuff as soon as they are touched and - // this code may want to do that on-demand instead. - iree_status_t status = iree_ok_status(); - - if (iree_status_is_ok(status)) { - *out_driver = (iree_hal_driver_t*)driver; - } else { - iree_hal_driver_release((iree_hal_driver_t*)driver); - } + *out_driver = (iree_hal_driver_t*)driver; IREE_TRACE_ZONE_END(z0); - return status; + return iree_ok_status(); } static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { @@ -86,8 +64,6 @@ static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { iree_allocator_t host_allocator = driver->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - // TODO(null): if the driver loaded any libraries they should be closed here. 
- iree_allocator_free(host_allocator, driver); IREE_TRACE_ZONE_END(z0); @@ -99,11 +75,6 @@ static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( iree_hal_driver_t* base_driver, iree_allocator_t host_allocator, iree_host_size_t* out_device_info_count, iree_hal_device_info_t** out_device_infos) { - // TODO(null): query available devices and populate the output. Note that - // unlike most IREE functions this allocates if required in order to allow - // this to return uncached information. Uncached is preferred as it allows - // devices that may come and go (power toggles, user visibility toggles, etc) - // through a process lifetime to appear without needing a full restart. static const iree_hal_device_info_t device_infos[1] = { { .device_id = IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT, @@ -123,20 +94,8 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( iree_allocator_t host_allocator, iree_hal_device_t** out_device) { iree_hal_xrt_lite_driver_t* driver = iree_hal_xrt_lite_driver_cast(base_driver); - - // TODO(null): use the provided params to overwrite the default options. The - // format of the params is implementation-defined. The params strings can be - // directly referenced if needed as the device creation is only allowed to - // access them during the create call below. iree_hal_xrt_lite_device_options_t options = driver->options.default_device_options; - - // TODO(null): implement creation by device_id; this is mostly used as - // query_available_devices->create_device_by_id to avoid needing to expose - // device paths (which may not always be 1:1). This skeleton only has a single - // device so the ID is ignored. 
- (void)driver; - return iree_hal_xrt_lite_device_create(driver->identifier, &options, host_allocator, out_device); } @@ -148,20 +107,8 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path( iree_hal_device_t** out_device) { iree_hal_xrt_lite_driver_t* driver = iree_hal_xrt_lite_driver_cast(base_driver); - - // TODO(null): use the provided params to overwrite the default options. The - // format of the params is implementation-defined. The params strings can be - // directly referenced if needed as the device creation is only allowed to - // access them during the create call below. iree_hal_xrt_lite_device_options_t options = driver->options.default_device_options; - - // TODO(null): support parsing of the device_path. Note that a single driver - // may respond to multiple driver_name queries. Paths are - // implementation-specific and there may be multiple formats; for example, - // device UUID, PCI bus ID, ordinal as used by underlying APIs, etc. - (void)driver; - return iree_hal_xrt_lite_device_create(driver->identifier, &options, host_allocator, out_device); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c index b0ba1c433..d0c82e79f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c @@ -12,9 +12,6 @@ static iree_status_t iree_hal_xrt_lite_driver_factory_enumerate( void* self, iree_host_size_t* out_driver_info_count, const iree_hal_driver_info_t** out_driver_infos) { - // TODO(null): return multiple drivers if desired. This information must be - // static. The list here is just what is compiled into the binary and not - // expected to actually try to load or initialize drivers. 
static const iree_hal_driver_info_t default_driver_info = { .driver_name = IREE_SVL("xrt-lite"), .full_name = IREE_SVL("XRT-LITE driver (for AIE)"), @@ -27,16 +24,13 @@ static iree_status_t iree_hal_xrt_lite_driver_factory_enumerate( static iree_status_t iree_hal_xrt_lite_driver_factory_try_create( void* self, iree_string_view_t driver_name, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) { - // TODO(null): use your driver name - this will be the prefix when the user - // specifies the device (`--device=null://foo`). A single driver can support - // multiple prefixes if it wants. if (!iree_string_view_equal(driver_name, IREE_SV("xrt-lite"))) { return iree_make_status(IREE_STATUS_UNAVAILABLE, "no driver '%.*s' is provided by this factory", (int)driver_name.size, driver_name.data); } - // TODO(null): populate options from flags. This driver module file is only + // TODO(max): populate options from flags. This driver module file is only // used in native tools that have access to the flags library. Programmatic // creation of the driver and devices will bypass this file and pass the // options via this struct or key-value string parameters. 
From a5e6785595912d7c3fe8107201b2cbc90e053d3a Mon Sep 17 00:00:00 2001 From: makslevental Date: Tue, 15 Oct 2024 13:35:52 -0400 Subject: [PATCH 18/35] refactor device.cc --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 7 - .../iree-amd-aie/driver/xrt-lite/device.cc | 121 ++++++++---------- 2 files changed, 56 insertions(+), 72 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index b98e52f02..cb71c7db2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -11,10 +11,6 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" #include "iree-amd-aie/driver/xrt-lite/util.h" -#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING -static const char* IREE_HAL_XRT_LITE_ALLOCATOR_ID = "XRT-LITE unpooled"; -#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING - namespace { extern const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable; } @@ -95,8 +91,6 @@ struct iree_hal_xrt_lite_allocator { iree_hal_buffer_release_callback_null(), this->host_allocator, &buffer); if (iree_status_is_ok(status)) { - IREE_TRACE_ALLOC_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr, - allocation_size); IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( &this->statistics, compat_params.type, allocation_size)); *out_buffer = buffer; @@ -109,7 +103,6 @@ struct iree_hal_xrt_lite_allocator { void deallocate_buffer(iree_hal_buffer_t* base_buffer) { bool was_imported = false; if (!was_imported) { - IREE_TRACE_FREE_NAMED(IREE_HAL_XRT_LITE_ALLOCATOR_ID, impl_ptr); IREE_STATISTICS(iree_hal_allocator_statistics_record_free( &this->statistics, iree_hal_buffer_memory_type(base_buffer), iree_hal_buffer_allocation_size(base_buffer))); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 3e2bdd578..2d15faaa1 100644 --- 
a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -17,22 +17,43 @@ #define ARENA_BLOCK_SIZE (32 * 1024) +namespace { +extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; +} + struct iree_hal_xrt_lite_device { iree_hal_resource_t resource; iree_string_view_t identifier; - iree_allocator_t host_allocator; + iree_allocator_t host_allocator_; // not used - iree_hal_allocator_t* device_allocator; + iree_hal_allocator_t* device_allocator_; // Block pool used for command buffers with a larger block size (as command // buffers can contain inlined data uploads). iree_arena_block_pool_t block_pool; shim_xdna::device* shim_device; + iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_options_t* options, + iree_allocator_t host_allocator) { + IREE_ASSERT_ARGUMENT(options); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &resource); + this->host_allocator_ = host_allocator; + shim_device = new shim_xdna::device; + + iree_status_t status = iree_hal_xrt_lite_allocator_create( + host_allocator, shim_device, &device_allocator_); + IREE_ASSERT(iree_status_is_ok(status)); + iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, + &block_pool); + IREE_TRACE_ZONE_END(z0); + } + iree_status_t create_executable_cache( iree_string_view_t identifier, iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { return iree_hal_xrt_lite_nop_executable_cache_create( - shim_device, identifier, host_allocator, out_executable_cache); + shim_device, identifier, host_allocator_, out_executable_cache); } iree_status_t create_command_buffer( @@ -46,14 +67,14 @@ struct iree_hal_xrt_lite_device { "unimplmented multi-shot command buffer"); } return iree_hal_deferred_command_buffer_create( - device_allocator, mode, command_categories, binding_capacity, - &block_pool, host_allocator, out_command_buffer); + device_allocator_, mode, command_categories, 
binding_capacity, + &block_pool, host_allocator_, out_command_buffer); } iree_status_t create_semaphore(uint64_t initial_value, iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) { - return iree_hal_xrt_lite_semaphore_create(host_allocator, initial_value, + return iree_hal_xrt_lite_semaphore_create(host_allocator_, initial_value, out_semaphore); } @@ -74,9 +95,9 @@ struct iree_hal_xrt_lite_device { IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_xrt_lite_direct_command_buffer_create( - shim_device, device_allocator, mode, + shim_device, device_allocator_, mode, IREE_HAL_COMMAND_CATEGORY_ANY, - /*binding_capacity=*/0, &block_pool, host_allocator, + /*binding_capacity=*/0, &block_pool, host_allocator_, &xrt_command_buffer)); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_deferred_command_buffer_apply( @@ -89,8 +110,8 @@ struct iree_hal_xrt_lite_device { void replace_device_allocator(iree_hal_allocator_t* new_allocator) { iree_hal_allocator_retain(new_allocator); - iree_hal_allocator_release(this->device_allocator); - this->device_allocator = new_allocator; + iree_hal_allocator_release(this->device_allocator_); + this->device_allocator_ = new_allocator; } iree_status_t query_i64(iree_string_view_t category, iree_string_view_t key, @@ -120,15 +141,26 @@ struct iree_hal_xrt_lite_device { IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout())); IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( - device_allocator, params, allocation_size, out_buffer)); + device_allocator_, params, allocation_size, out_buffer)); IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list)); return iree_ok_status(); } -}; -namespace { -extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; -} + iree_string_view_t id() { return this->identifier; } + + void destroy() { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_allocator_release(this->device_allocator_); 
+ delete this->shim_device; + iree_allocator_free(host_allocator_, this); + + IREE_TRACE_ZONE_END(z0); + }; + + iree_allocator_t host_allocator() { return this->host_allocator_; } + iree_hal_allocator_t* device_allocator() { return this->device_allocator_; } +}; void iree_hal_xrt_lite_device_options_initialize( iree_hal_xrt_lite_device_options_t* out_options) { @@ -144,25 +176,16 @@ iree_status_t iree_hal_xrt_lite_device_create( *out_device = nullptr; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = nullptr; + iree_hal_xrt_lite_device* device = + new iree_hal_xrt_lite_device(options, host_allocator); + iree_status_t status = iree_ok_status(); iree_host_size_t total_size = sizeof(*device) + identifier.size; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_allocator_malloc(host_allocator, total_size, - reinterpret_cast(&device))); - iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, - &device->resource); iree_string_view_append_to_buffer( identifier, &device->identifier, reinterpret_cast(device) + total_size - identifier.size); - device->host_allocator = host_allocator; - device->shim_device = new shim_xdna::device; - - iree_status_t status = iree_hal_xrt_lite_allocator_create( - host_allocator, device->shim_device, &device->device_allocator); - iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, - &device->block_pool); // TODO(max): device id *out_device = reinterpret_cast(device); + if (iree_status_is_ok(status)) { } else { iree_hal_device_release(reinterpret_cast(device)); @@ -171,42 +194,6 @@ iree_status_t iree_hal_xrt_lite_device_create( return status; } -static iree_hal_xrt_lite_device* iree_hal_xrt_lite_device_cast( - iree_hal_device_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_device_vtable); - return reinterpret_cast(base_value); -} - -static iree_string_view_t iree_hal_xrt_lite_device_id( - iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device* device = 
iree_hal_xrt_lite_device_cast(base_device); - return device->identifier; -} - -static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device* device = iree_hal_xrt_lite_device_cast(base_device); - iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device); - IREE_TRACE_ZONE_BEGIN(z0); - - iree_hal_allocator_release(device->device_allocator); - delete device->shim_device; - iree_allocator_free(host_allocator, device); - - IREE_TRACE_ZONE_END(z0); -}; - -static iree_allocator_t iree_hal_xrt_lite_device_host_allocator( - iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device* device = iree_hal_xrt_lite_device_cast(base_device); - return device->host_allocator; -} - -static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( - iree_hal_device_t* base_device) { - iree_hal_xrt_lite_device* device = iree_hal_xrt_lite_device_cast(base_device); - return device->device_allocator; -} - #define DEVICE_MEMBER(member, return_t) \ MEMBER_WRAPPER(iree_hal_device_t, iree_hal_xrt_lite_device, member, return_t) #define DEVICE_MEMBER_STATUS(member) \ @@ -214,8 +201,12 @@ static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( #define DEVICE_MEMBER_VOID(member) \ MEMBER_WRAPPER_VOID(iree_hal_device_t, iree_hal_xrt_lite_device, member) -DEVICE_MEMBER_STATUS(create_executable_cache); +DEVICE_MEMBER(host_allocator, iree_allocator_t); +DEVICE_MEMBER(device_allocator, iree_hal_allocator_t*); +DEVICE_MEMBER(id, iree_string_view_t); +DEVICE_MEMBER_VOID(destroy); DEVICE_MEMBER_STATUS(create_command_buffer); +DEVICE_MEMBER_STATUS(create_executable_cache); DEVICE_MEMBER_STATUS(create_semaphore); DEVICE_MEMBER_STATUS(queue_execute); DEVICE_MEMBER_STATUS(query_i64); From 8aaa1d2cbf84cde7bea4316381c352fcafb2a135 Mon Sep 17 00:00:00 2001 From: makslevental Date: Tue, 15 Oct 2024 16:31:09 -0400 Subject: [PATCH 19/35] fix iree-benchmark --- .../iree-amd-aie/driver/xrt-lite/device.cc | 3 ++ 
.../driver/xrt-lite/direct_command_buffer.cc | 27 +++++++------- .../driver/xrt-lite/executable.cc | 37 ++++--------------- .../iree-amd-aie/driver/xrt-lite/executable.h | 26 +++++++------ 4 files changed, 39 insertions(+), 54 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 2d15faaa1..fcab63603 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -227,5 +227,8 @@ const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { .create_semaphore = iree_hal_xrt_lite_device_create_semaphore, .queue_alloca = iree_hal_xrt_lite_device_queue_alloca, .queue_execute = iree_hal_xrt_lite_device_queue_execute, + .profiling_begin = unimplemented_ok_status, + .profiling_flush = unimplemented_ok_status, + .profiling_end = unimplemented_ok_status, }; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index bfcf3c0d2..6733c9a2c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -178,7 +178,7 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_copy_buffer( static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( iree_hal_command_buffer_t* base_command_buffer, - iree_hal_executable_t* executable, int32_t entry_point, + iree_hal_executable_t* base_executable, int32_t entry_point, const uint32_t workgroup_count[3], iree_const_byte_span_t constants, iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) { iree_hal_xrt_lite_direct_command_buffer* command_buffer = @@ -188,34 +188,35 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( // Lookup kernel parameters used for side-channeling additional launch // information from the compiler. 
- iree_hal_xrt_lite_kernel_params_t kernel_params; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_hal_xrt_lite_native_executable_entry_point_kernel_params( - executable, entry_point, &kernel_params)); + iree_hal_xrt_lite_native_executable_t* executable = + iree_hal_xrt_lite_native_executable_cast(base_executable); + iree_hal_xrt_lite_kernel_params_t kernel_params = + executable->entry_points[entry_point]; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &executable)); - kernel_params.context = command_buffer->shim_device->create_hw_context( - kernel_params.pdiVector, kernel_params.kernel_name); - uint32_t num_instr = flatbuffers_uint32_vec_len(kernel_params.asm_inst); - size_t ctrl_code_size = num_instr * sizeof(uint32_t); + std::unique_ptr context = + command_buffer->shim_device->create_hw_context(kernel_params.pdi, + kernel_params.kernel_name); + size_t ctrl_code_size = kernel_params.asm_inst.size() * sizeof(uint32_t); auto bo_ctrl_code = command_buffer->shim_device->alloc_bo( ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); uint32_t* instr_buffer = static_cast(bo_ctrl_code->map()); - memcpy(instr_buffer, kernel_params.asm_inst, ctrl_code_size); + memcpy(instr_buffer, kernel_params.asm_inst.data(), ctrl_code_size); bo_ctrl_code->sync(shim_xdna::direction::host2device); shim_xdna::cuidx_t cu_idx = - kernel_params.context->open_cu_context(kernel_params.kernel_name); + context->open_cu_context(kernel_params.kernel_name); shim_xdna::kernel ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU); ebuf.set_cu_idx(cu_idx); unsigned int opcode = 3; ebuf.add_arg_64(opcode); ebuf.add_arg_bo(*bo_ctrl_code); - ebuf.add_arg_32(num_instr); + ebuf.add_arg_32(kernel_params.asm_inst.size()); + for (iree_host_size_t j = 0; j < bindings.count; ++j) { shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); @@ -223,7 +224,7 @@ static iree_status_t 
iree_hal_xrt_lite_direct_command_buffer_dispatch( bo->sync(shim_xdna::direction::host2device); } - shim_xdna::hw_q* hwq = kernel_params.context->get_hw_queue(); + shim_xdna::hw_q* hwq = context->get_hw_queue(); hwq->issue_command(ebuf.get_exec_buf_bo()); hwq->wait_command(ebuf.get_exec_buf_bo(), 0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index 08ff85dc0..d55e3f007 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -14,22 +14,13 @@ #include "iree-amd-aie/schemas/pdi_executable_def_verifier.h" #include "iree/base/api.h" -struct iree_hal_xrt_lite_native_executable_t { - // Abstract resource used for injecting reference counting and vtable; must be - // at offset 0. - iree_hal_resource_t resource; - iree_allocator_t host_allocator; - iree_host_size_t entry_point_count; - iree_hal_xrt_lite_kernel_params_t entry_points[16]; -}; - namespace { extern const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable; } // namespace -static iree_hal_xrt_lite_native_executable_t* -iree_hal_xrt_lite_native_executable_cast(iree_hal_executable_t* base_value) { +iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast( + iree_hal_executable_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_native_executable_vtable); return reinterpret_cast(base_value); } @@ -176,14 +167,17 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( std::vector pdiVector(pdi_fb, pdi_fb + flatbuffers_string_len(pdi_fb)); - params->pdiVector = pdiVector; + params->pdi = pdiVector; uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); iree_amd_aie_hal_xrt_lite_AsmInstDef_table_t asminst_def = iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_at(asm_instrs_vec, asm_instr_index); - params->asm_inst = + flatbuffers_uint32_vec_t asm_inst = 
iree_amd_aie_hal_xrt_lite_AsmInstDef_asm_inst_get(asminst_def); + std::vector asmVector( + asm_inst, asm_inst + flatbuffers_uint32_vec_len(asm_inst)); + params->asm_inst = asmVector; // Stash the entry point name in the string table for use when tracing. IREE_TRACE({ @@ -229,23 +223,6 @@ static void iree_hal_xrt_lite_native_executable_destroy( IREE_TRACE_ZONE_END(z0); } -iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( - iree_hal_executable_t* base_executable, int32_t entry_point, - iree_hal_xrt_lite_kernel_params_t* out_params) { - iree_hal_xrt_lite_native_executable_t* executable = - iree_hal_xrt_lite_native_executable_cast(base_executable); - if (entry_point >= executable->entry_point_count) { - return iree_make_status(IREE_STATUS_OUT_OF_RANGE, - "entry point ordinal %d out of range; executable " - "only contains %" PRIhsz " entry points", - entry_point, executable->entry_point_count); - } - - memcpy(out_params, &executable->entry_points[entry_point], - sizeof(*out_params)); - return iree_ok_status(); -} - namespace { const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable = { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index 57b477b4c..33b84bcc7 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -23,16 +23,25 @@ extern "C" { // Object and launch parameters for a compute kernel. 
struct iree_hal_xrt_lite_kernel_params_t { - std::unique_ptr context; - std::unique_ptr bo_ctrl_code; - std::vector pdiVector; - flatbuffers_uint32_vec_t asm_inst; - // Number of assembly instructions argument to the kernel + std::vector pdi; + std::vector asm_inst; std::string kernel_name; IREE_TRACE(iree_string_view_t source_filename;) IREE_TRACE(uint32_t source_line;) }; +struct iree_hal_xrt_lite_native_executable_t { + // Abstract resource used for injecting reference counting and vtable; must be + // at offset 0. + iree_hal_resource_t resource; + iree_allocator_t host_allocator; + iree_host_size_t entry_point_count; + iree_hal_xrt_lite_kernel_params_t entry_points[16]; +}; + +iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast( + iree_hal_executable_t* base_value); + // |out_executable| must be released by the caller (see // iree_hal_executable_release). iree_status_t iree_hal_xrt_lite_native_executable_create( @@ -40,13 +49,8 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); -// Returns the kernel launch parameters for the given |entry_point|. 
-iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( - iree_hal_executable_t* executable, int32_t entry_point, - iree_hal_xrt_lite_kernel_params_t* out_params); - #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ From 976c4185c5b310e90efd614c30b02a37fe5b84b0 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 15 Oct 2024 21:45:32 -0400 Subject: [PATCH 20/35] undo OO --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 161 +++++----- .../src/iree-amd-aie/driver/xrt-lite/api.h | 2 +- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 102 ++++--- .../src/iree-amd-aie/driver/xrt-lite/buffer.h | 22 -- .../iree-amd-aie/driver/xrt-lite/device.cc | 276 +++++++++--------- .../driver/xrt-lite/direct_command_buffer.cc | 41 +-- .../driver/xrt-lite/direct_command_buffer.h | 6 +- .../iree-amd-aie/driver/xrt-lite/executable.h | 2 +- .../driver/xrt-lite/nop_executable_cache.h | 4 +- .../driver/xrt-lite/nop_semaphore.h | 2 +- .../src/iree-amd-aie/driver/xrt-lite/util.h | 13 - 11 files changed, 286 insertions(+), 345 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index cb71c7db2..509556dad 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -29,88 +29,93 @@ struct iree_hal_xrt_lite_allocator { &this->resource); IREE_TRACE_ZONE_END(z0); } +}; - ~iree_hal_xrt_lite_allocator() = default; +iree_hal_buffer_compatibility_t query_buffer_compatibility( + iree_hal_allocator_t* base_allocator, iree_hal_buffer_params_t* params, + iree_device_size_t* allocation_size) { + // All buffers can be allocated on the heap. 
+ iree_hal_buffer_compatibility_t compatibility = + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; - iree_hal_buffer_compatibility_t query_buffer_compatibility( - iree_hal_buffer_params_t* params, iree_device_size_t* allocation_size) { - // All buffers can be allocated on the heap. - iree_hal_buffer_compatibility_t compatibility = - IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; + if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + } - if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { - compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + // Buffers can only be used on the queue if they are device visible. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + if (iree_any_bit_set(params->usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; } + } - // Buffers can only be used on the queue if they are device visible. - if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { - if (iree_any_bit_set(params->usage, - IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { - compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; - } - } + // We are now optimal. + params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; - // We are now optimal. - params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; + // Guard against the corner case where the requested buffer size is 0. The + // application is unlikely to do anything when requesting a 0-byte buffer; + // but it can happen in real world use cases. So we should at least not + // crash. + if (*allocation_size == 0) *allocation_size = 4; + // Align allocation sizes to 4 bytes so shaders operating on 32 bit types + // can act safely even on buffer ranges that are not naturally aligned. + *allocation_size = iree_host_align(*allocation_size, 4); - // Guard against the corner case where the requested buffer size is 0. 
The - // application is unlikely to do anything when requesting a 0-byte buffer; - // but it can happen in real world use cases. So we should at least not - // crash. - if (*allocation_size == 0) *allocation_size = 4; - // Align allocation sizes to 4 bytes so shaders operating on 32 bit types - // can act safely even on buffer ranges that are not naturally aligned. - *allocation_size = iree_host_align(*allocation_size, 4); + return compatibility; +} - return compatibility; +iree_status_t allocate_buffer(iree_hal_allocator_t* base_allocator, + const iree_hal_buffer_params_t* params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** out_buffer) { + iree_hal_xrt_lite_allocator* allocator = + reinterpret_cast(base_allocator); + // Coerce options into those required by the current device. + iree_hal_buffer_params_t compat_params = *params; + iree_hal_buffer_compatibility_t compatibility = query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters"); } - iree_status_t allocate_buffer(const iree_hal_buffer_params_t* params, - iree_device_size_t allocation_size, - iree_hal_buffer_t** out_buffer) { - // Coerce options into those required by the current device. 
- iree_hal_buffer_params_t compat_params = *params; - iree_hal_buffer_compatibility_t compatibility = - this->query_buffer_compatibility(&compat_params, &allocation_size); - if (!iree_all_bits_set(compatibility, - IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { - return iree_make_status( - IREE_STATUS_INVALID_ARGUMENT, - "allocator cannot allocate a buffer with the given parameters"); - } + uint32_t flags = XCL_BO_FLAGS_HOST_ONLY; + std::unique_ptr bo = + allocator->shim_device->alloc_bo(allocation_size, flags); + iree_hal_buffer_t* buffer = nullptr; + iree_status_t status = iree_hal_xrt_lite_buffer_wrap( + std::move(bo), reinterpret_cast(allocator), + compat_params.type, compat_params.access, compat_params.usage, + allocation_size, + /*byte_offset=*/0, /*byte_length=*/allocation_size, + iree_hal_buffer_release_callback_null(), allocator->host_allocator, + &buffer); - uint32_t flags = XCL_BO_FLAGS_HOST_ONLY; - std::unique_ptr bo = - shim_device->alloc_bo(allocation_size, flags); - iree_hal_buffer_t* buffer = nullptr; - iree_status_t status = iree_hal_xrt_lite_buffer_wrap( - std::move(bo), reinterpret_cast(this), - compat_params.type, compat_params.access, compat_params.usage, - allocation_size, - /*byte_offset=*/0, /*byte_length=*/allocation_size, - iree_hal_buffer_release_callback_null(), this->host_allocator, &buffer); - - if (iree_status_is_ok(status)) { - IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( - &this->statistics, compat_params.type, allocation_size)); - *out_buffer = buffer; - } else { - iree_hal_buffer_release(buffer); - } - return status; + if (iree_status_is_ok(status)) { + IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( + &allocator->statistics, compat_params.type, allocation_size)); + *out_buffer = buffer; + } else { + iree_hal_buffer_release(buffer); } + return status; +} - void deallocate_buffer(iree_hal_buffer_t* base_buffer) { - bool was_imported = false; - if (!was_imported) { - 
IREE_STATISTICS(iree_hal_allocator_statistics_record_free( - &this->statistics, iree_hal_buffer_memory_type(base_buffer), - iree_hal_buffer_allocation_size(base_buffer))); - } - - iree_hal_buffer_destroy(base_buffer); +void deallocate_buffer(iree_hal_allocator_t* base_allocator, + iree_hal_buffer_t* base_buffer) { + iree_hal_xrt_lite_allocator* allocator = + reinterpret_cast(base_allocator); + bool was_imported = false; + if (!was_imported) { + IREE_STATISTICS(iree_hal_allocator_statistics_record_free( + &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), + iree_hal_buffer_allocation_size(base_buffer))); } -}; + iree_hal_buffer_destroy(base_buffer); +} static iree_hal_xrt_lite_allocator* iree_hal_xrt_lite_allocator_cast( iree_hal_allocator_t* base_value) { @@ -162,28 +167,14 @@ static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator( return allocator->host_allocator; } -#define ALLOCATOR_MEMBER(member, return_t) \ - MEMBER_WRAPPER(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, member, \ - return_t) -#define ALLOCATOR_MEMBER_STATUS(member) \ - MEMBER_WRAPPER_STATUS(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, \ - member) -#define ALLOCATOR_MEMBER_VOID(member) \ - MEMBER_WRAPPER_VOID(iree_hal_allocator_t, iree_hal_xrt_lite_allocator, member) - -ALLOCATOR_MEMBER(query_buffer_compatibility, iree_hal_buffer_compatibility_t); -ALLOCATOR_MEMBER_STATUS(allocate_buffer); -ALLOCATOR_MEMBER_VOID(deallocate_buffer); - namespace { const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable = { .destroy = iree_hal_xrt_lite_allocator_destroy, .host_allocator = iree_hal_xrt_lite_allocator_host_allocator, .trim = unimplemented_ok_status, .query_statistics = unimplemented_ok_void, - .query_buffer_compatibility = - iree_hal_xrt_lite_allocator_query_buffer_compatibility, - .allocate_buffer = iree_hal_xrt_lite_allocator_allocate_buffer, - .deallocate_buffer = iree_hal_xrt_lite_allocator_deallocate_buffer, + .query_buffer_compatibility = 
query_buffer_compatibility, + .allocate_buffer = allocate_buffer, + .deallocate_buffer = deallocate_buffer, }; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h index 9cc04cf46..bd6049238 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -42,7 +42,7 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create( iree_allocator_t host_allocator, iree_hal_device_t** out_device); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index 09dc19446..c9fd06bd7 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -9,28 +9,56 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" #include "iree-amd-aie/driver/xrt-lite/util.h" -iree_status_t iree_hal_xrt_lite_buffer::map_range( - iree_hal_mapping_mode_t mapping_mode, - iree_hal_memory_access_t memory_access, - iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { +namespace { +extern const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable; +} + +struct iree_hal_xrt_lite_buffer { + iree_hal_buffer_t base; + std::unique_ptr bo; + iree_hal_buffer_release_callback_t release_callback; +}; + +iree_status_t invalidate_range(iree_hal_buffer_t* base_buffer, + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + iree_hal_xrt_lite_buffer* buffer = + reinterpret_cast(base_buffer); + if (IREE_UNLIKELY(!buffer->bo)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); + } + buffer->bo->sync(shim_xdna::direction::device2host, local_byte_length, + local_byte_offset); + return 
iree_ok_status(); +} + +iree_status_t map_range(iree_hal_buffer_t* base_buffer, + iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + iree_hal_xrt_lite_buffer* buffer = + reinterpret_cast(base_buffer); IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( iree_hal_buffer_memory_type( - reinterpret_cast(this)), + reinterpret_cast(buffer)), IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( iree_hal_buffer_allowed_usage( - reinterpret_cast(this)), + reinterpret_cast(buffer)), mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); - void* host_ptr = this->bo->map(); + void* host_ptr = buffer->bo->map(); // Should be guaranteed by previous checks. IREE_ASSERT(host_ptr != nullptr); - uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; + uint8_t* data_ptr = reinterpret_cast(host_ptr) + local_byte_offset; iree_status_t status = - this->invalidate_range(local_byte_offset, local_byte_length); + invalidate_range(base_buffer, local_byte_offset, local_byte_length); // If we mapped for discard, scribble over the bytes. This is not a mandated // behavior but it will make debugging issues easier. 
Alternatively for heap // buffers we could reallocate them such that ASAN yells, but that would @@ -44,38 +72,26 @@ iree_status_t iree_hal_xrt_lite_buffer::map_range( return status; } -iree_status_t iree_hal_xrt_lite_buffer::unmap_range( - iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { - return this->flush_range(local_byte_offset, local_byte_length); -} - -iree_status_t iree_hal_xrt_lite_buffer::invalidate_range( - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - if (IREE_UNLIKELY(!this->bo)) { - return iree_make_status( - IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have device memory attached and cannot be mapped"); - } - this->bo->sync(shim_xdna::direction::device2host); - return iree_ok_status(); -} - -iree_status_t iree_hal_xrt_lite_buffer::flush_range( - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - if (IREE_UNLIKELY(!this->bo)) { +iree_status_t flush_range(iree_hal_buffer_t* base_buffer, + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + iree_hal_xrt_lite_buffer* buffer = + reinterpret_cast(base_buffer); + if (IREE_UNLIKELY(!buffer->bo)) { return iree_make_status( IREE_STATUS_FAILED_PRECONDITION, "buffer does not have device memory attached and cannot be mapped"); } - this->bo->sync(shim_xdna::direction::host2device); + buffer->bo->sync(shim_xdna::direction::host2device, local_byte_length, + local_byte_offset); return iree_ok_status(); } -namespace { -extern const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable; +iree_status_t unmap_range(iree_hal_buffer_t* base_buffer, + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + return flush_range(base_buffer, local_byte_offset, local_byte_length); } iree_status_t iree_hal_xrt_lite_buffer_wrap( @@ -128,21 +144,13 @@ shim_xdna::bo* 
iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer) { return buffer->bo.get(); } -#define BUFFER_MEMBER_STATUS(member) \ - MEMBER_WRAPPER_STATUS(iree_hal_buffer_t, iree_hal_xrt_lite_buffer, member) - -BUFFER_MEMBER_STATUS(map_range); -BUFFER_MEMBER_STATUS(unmap_range); -BUFFER_MEMBER_STATUS(invalidate_range); -BUFFER_MEMBER_STATUS(flush_range); - namespace { const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable = { .recycle = iree_hal_buffer_recycle, .destroy = iree_hal_xrt_lite_buffer_destroy, - .map_range = iree_hal_xrt_lite_buffer_map_range, - .unmap_range = iree_hal_xrt_lite_buffer_unmap_range, - .invalidate_range = iree_hal_xrt_lite_buffer_invalidate_range, - .flush_range = iree_hal_xrt_lite_buffer_flush_range, + .map_range = map_range, + .unmap_range = unmap_range, + .invalidate_range = invalidate_range, + .flush_range = flush_range, }; } \ No newline at end of file diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h index c6f34b7b9..334478d14 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h @@ -11,28 +11,6 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -struct iree_hal_xrt_lite_buffer { - iree_hal_buffer_t base; - std::unique_ptr bo; - iree_hal_buffer_release_callback_t release_callback; - - iree_status_t map_range(iree_hal_mapping_mode_t mapping_mode, - iree_hal_memory_access_t memory_access, - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping); - - iree_status_t unmap_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping); - - iree_status_t invalidate_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length); - - iree_status_t flush_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length); -}; - iree_status_t 
iree_hal_xrt_lite_buffer_wrap( std::unique_ptr bo, iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index fcab63603..70da3dbdd 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -48,120 +48,150 @@ struct iree_hal_xrt_lite_device { &block_pool); IREE_TRACE_ZONE_END(z0); } +}; - iree_status_t create_executable_cache( - iree_string_view_t identifier, iree_loop_t loop, - iree_hal_executable_cache_t** out_executable_cache) { - return iree_hal_xrt_lite_nop_executable_cache_create( - shim_device, identifier, host_allocator_, out_executable_cache); - } +iree_status_t create_executable_cache( + iree_hal_device_t* base_value, iree_string_view_t identifier, + iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + return iree_hal_xrt_lite_nop_executable_cache_create( + device->shim_device, identifier, device->host_allocator_, + out_executable_cache); +} - iree_status_t create_command_buffer( - iree_hal_command_buffer_mode_t mode, - iree_hal_command_category_t command_categories, - iree_hal_queue_affinity_t queue_affinity, - iree_host_size_t binding_capacity, - iree_hal_command_buffer_t** out_command_buffer) { - if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "unimplmented multi-shot command buffer"); - } - return iree_hal_deferred_command_buffer_create( - device_allocator_, mode, command_categories, binding_capacity, - &block_pool, host_allocator_, out_command_buffer); +iree_status_t create_command_buffer( + iree_hal_device_t* base_value, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, 
iree_host_size_t binding_capacity, + iree_hal_command_buffer_t** out_command_buffer) { + if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "unimplmented multi-shot command buffer"); } + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + return iree_hal_deferred_command_buffer_create( + device->device_allocator_, mode, command_categories, binding_capacity, + &device->block_pool, device->host_allocator_, out_command_buffer); +} - iree_status_t create_semaphore(uint64_t initial_value, - iree_hal_semaphore_flags_t flags, - iree_hal_semaphore_t** out_semaphore) { - return iree_hal_xrt_lite_semaphore_create(host_allocator_, initial_value, - out_semaphore); - } +iree_status_t create_semaphore(iree_hal_device_t* base_value, + uint64_t initial_value, + iree_hal_semaphore_flags_t flags, + iree_hal_semaphore_t** out_semaphore) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + return iree_hal_xrt_lite_semaphore_create(device->host_allocator_, + initial_value, out_semaphore); +} - iree_status_t queue_execute( - iree_hal_queue_affinity_t queue_affinity, - const iree_hal_semaphore_list_t wait_semaphore_list, - const iree_hal_semaphore_list_t signal_semaphore_list, - iree_host_size_t command_buffer_count, - iree_hal_command_buffer_t* const* command_buffers, - iree_hal_buffer_binding_table_t const* binding_tables) { - IREE_TRACE_ZONE_BEGIN(z0); +iree_status_t queue_execute( + iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_host_size_t command_buffer_count, + iree_hal_command_buffer_t* const* command_buffers, + iree_hal_buffer_binding_table_t const* binding_tables) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + IREE_TRACE_ZONE_BEGIN(z0); - for (iree_host_size_t i = 0; i < command_buffer_count; i++) { 
- iree_hal_command_buffer_t* xrt_command_buffer = nullptr; - iree_hal_command_buffer_mode_t mode = - IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT | - IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION | - IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_hal_xrt_lite_direct_command_buffer_create( - shim_device, device_allocator_, mode, - IREE_HAL_COMMAND_CATEGORY_ANY, - /*binding_capacity=*/0, &block_pool, host_allocator_, - &xrt_command_buffer)); - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_hal_deferred_command_buffer_apply( - command_buffers[i], xrt_command_buffer, - iree_hal_buffer_binding_table_empty())); - } - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); + for (iree_host_size_t i = 0; i < command_buffer_count; i++) { + iree_hal_command_buffer_t* xrt_command_buffer = nullptr; + iree_hal_command_buffer_mode_t mode = + IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT | + IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION | + IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_direct_command_buffer_create( + device->shim_device, device->device_allocator_, mode, + IREE_HAL_COMMAND_CATEGORY_ANY, + /*binding_capacity=*/0, &device->block_pool, + device->host_allocator_, &xrt_command_buffer)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_deferred_command_buffer_apply( + command_buffers[i], xrt_command_buffer, + iree_hal_buffer_binding_table_empty())); } + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} - void replace_device_allocator(iree_hal_allocator_t* new_allocator) { - iree_hal_allocator_retain(new_allocator); - iree_hal_allocator_release(this->device_allocator_); - this->device_allocator_ = new_allocator; - } +void replace_device_allocator(iree_hal_device_t* base_value, + iree_hal_allocator_t* new_allocator) { + iree_hal_allocator_retain(new_allocator); + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + 
iree_hal_allocator_release(device->device_allocator_); + device->device_allocator_ = new_allocator; +} - iree_status_t query_i64(iree_string_view_t category, iree_string_view_t key, - int64_t* out_value) { - *out_value = 0; - if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) { - *out_value = - iree_string_view_match_pattern(this->identifier, key) ? 1 : 0; - return iree_ok_status(); - } - - if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) { - *out_value = - iree_string_view_equal(key, IREE_SV("amdaie-pdi-fb")) ? 1 : 0; - return iree_ok_status(); - } - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported query"); +iree_status_t query_i64(iree_hal_device_t* base_value, + iree_string_view_t category, iree_string_view_t key, + int64_t* out_value) { + *out_value = 0; + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) { + *out_value = + iree_string_view_match_pattern(device->identifier, key) ? 1 : 0; + return iree_ok_status(); } - iree_status_t queue_alloca( - iree_hal_queue_affinity_t queue_affinity, - const iree_hal_semaphore_list_t wait_semaphore_list, - const iree_hal_semaphore_list_t signal_semaphore_list, - iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, - iree_device_size_t allocation_size, - iree_hal_buffer_t** IREE_RESTRICT out_buffer) { - IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, - iree_infinite_timeout())); - IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( - device_allocator_, params, allocation_size, out_buffer)); - IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list)); + if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) { + *out_value = iree_string_view_equal(key, IREE_SV("amdaie-pdi-fb")) ? 
1 : 0; return iree_ok_status(); } + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported query"); +} - iree_string_view_t id() { return this->identifier; } +iree_status_t queue_alloca( + iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, + iree_infinite_timeout())); + IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( + device->device_allocator_, params, allocation_size, out_buffer)); + IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list)); + return iree_ok_status(); +} - void destroy() { - IREE_TRACE_ZONE_BEGIN(z0); +iree_string_view_t id(iree_hal_device_t* base_value) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + return device->identifier; +} - iree_hal_allocator_release(this->device_allocator_); - delete this->shim_device; - iree_allocator_free(host_allocator_, this); +void destroy(iree_hal_device_t* base_value) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + IREE_TRACE_ZONE_BEGIN(z0); - IREE_TRACE_ZONE_END(z0); - }; + iree_hal_allocator_release(device->device_allocator_); + delete device->shim_device; + iree_allocator_free(device->host_allocator_, device); - iree_allocator_t host_allocator() { return this->host_allocator_; } - iree_hal_allocator_t* device_allocator() { return this->device_allocator_; } + IREE_TRACE_ZONE_END(z0); }; +iree_allocator_t host_allocator(iree_hal_device_t* base_value) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + return device->host_allocator_; +} +iree_hal_allocator_t* 
device_allocator(iree_hal_device_t* base_value) { + iree_hal_xrt_lite_device* device = + reinterpret_cast(base_value); + return device->device_allocator_; +} + void iree_hal_xrt_lite_device_options_initialize( iree_hal_xrt_lite_device_options_t* out_options) { memset(out_options, 0, sizeof(*out_options)); @@ -174,59 +204,37 @@ iree_status_t iree_hal_xrt_lite_device_create( IREE_ASSERT_ARGUMENT(options); IREE_ASSERT_ARGUMENT(out_device); *out_device = nullptr; + IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - new iree_hal_xrt_lite_device(options, host_allocator); - iree_status_t status = iree_ok_status(); - iree_host_size_t total_size = sizeof(*device) + identifier.size; + iree_hal_xrt_lite_device* device = nullptr; + iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size; + IREE_RETURN_IF_ERROR( + iree_allocator_malloc(host_allocator, total_size, (void**)&device)); + device = new (device) iree_hal_xrt_lite_device(options, host_allocator); iree_string_view_append_to_buffer( identifier, &device->identifier, reinterpret_cast(device) + total_size - identifier.size); // TODO(max): device id *out_device = reinterpret_cast(device); - if (iree_status_is_ok(status)) { - } else { - iree_hal_device_release(reinterpret_cast(device)); - } IREE_TRACE_ZONE_END(z0); - return status; + return iree_ok_status(); } -#define DEVICE_MEMBER(member, return_t) \ - MEMBER_WRAPPER(iree_hal_device_t, iree_hal_xrt_lite_device, member, return_t) -#define DEVICE_MEMBER_STATUS(member) \ - MEMBER_WRAPPER_STATUS(iree_hal_device_t, iree_hal_xrt_lite_device, member) -#define DEVICE_MEMBER_VOID(member) \ - MEMBER_WRAPPER_VOID(iree_hal_device_t, iree_hal_xrt_lite_device, member) - -DEVICE_MEMBER(host_allocator, iree_allocator_t); -DEVICE_MEMBER(device_allocator, iree_hal_allocator_t*); -DEVICE_MEMBER(id, iree_string_view_t); -DEVICE_MEMBER_VOID(destroy); -DEVICE_MEMBER_STATUS(create_command_buffer); -DEVICE_MEMBER_STATUS(create_executable_cache); 
-DEVICE_MEMBER_STATUS(create_semaphore); -DEVICE_MEMBER_STATUS(queue_execute); -DEVICE_MEMBER_STATUS(query_i64); -DEVICE_MEMBER_STATUS(queue_alloca); -DEVICE_MEMBER_VOID(replace_device_allocator); - namespace { const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { - .destroy = iree_hal_xrt_lite_device_destroy, - .id = iree_hal_xrt_lite_device_id, - .host_allocator = iree_hal_xrt_lite_device_host_allocator, - .device_allocator = iree_hal_xrt_lite_device_device_allocator, - .replace_device_allocator = - iree_hal_xrt_lite_device_replace_device_allocator, - .query_i64 = iree_hal_xrt_lite_device_query_i64, - .create_command_buffer = iree_hal_xrt_lite_device_create_command_buffer, - .create_executable_cache = iree_hal_xrt_lite_device_create_executable_cache, - .create_semaphore = iree_hal_xrt_lite_device_create_semaphore, - .queue_alloca = iree_hal_xrt_lite_device_queue_alloca, - .queue_execute = iree_hal_xrt_lite_device_queue_execute, + .destroy = destroy, + .id = id, + .host_allocator = host_allocator, + .device_allocator = device_allocator, + .replace_device_allocator = replace_device_allocator, + .query_i64 = query_i64, + .create_command_buffer = create_command_buffer, + .create_executable_cache = create_executable_cache, + .create_semaphore = create_semaphore, + .queue_alloca = queue_alloca, + .queue_execute = queue_execute, .profiling_begin = unimplemented_ok_status, .profiling_flush = unimplemented_ok_status, .profiling_end = unimplemented_ok_status, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index 6733c9a2c..d3404c5f6 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -13,13 +13,7 @@ #include "iree/hal/utils/resource_set.h" #include "util.h" -// The max number of bindings per descriptor set allowed in the XRT HAL -// implementation. 
#define IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT 16 - -// The max number of descriptor sets allowed in the XRT HAL implementation. -// This depends on the general descriptor set planning in IREE and should adjust -// with it. #define IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_COUNT 4 struct iree_hal_xrt_lite_direct_command_buffer { @@ -116,21 +110,6 @@ static void iree_hal_xrt_lite_direct_command_buffer_destroy( IREE_TRACE_ZONE_END(z0); } -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_end( - iree_hal_command_buffer_t* base_command_buffer) { - iree_hal_xrt_lite_direct_command_buffer* command_buffer = - iree_hal_xrt_lite_direct_command_buffer_cast(base_command_buffer); - IREE_TRACE_ZONE_BEGIN(z0); - iree_arena_reset(&command_buffer->arena); - iree_hal_resource_set_free(command_buffer->resource_set); - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_hal_resource_set_allocate(command_buffer->arena.block_pool, - &command_buffer->resource_set)); - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - static iree_status_t iree_hal_xrt_lite_direct_command_buffer_update_buffer( iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer, iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) { @@ -197,9 +176,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &executable)); - std::unique_ptr context = - command_buffer->shim_device->create_hw_context(kernel_params.pdi, - kernel_params.kernel_name); size_t ctrl_code_size = kernel_params.asm_inst.size() * sizeof(uint32_t); auto bo_ctrl_code = command_buffer->shim_device->alloc_bo( ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); @@ -207,10 +183,13 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( memcpy(instr_buffer, kernel_params.asm_inst.data(), ctrl_code_size); bo_ctrl_code->sync(shim_xdna::direction::host2device); + shim_xdna::kernel 
ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU); + std::unique_ptr context = + command_buffer->shim_device->create_hw_context(kernel_params.pdi, + kernel_params.kernel_name); shim_xdna::cuidx_t cu_idx = context->open_cu_context(kernel_params.kernel_name); - shim_xdna::kernel ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU); ebuf.set_cu_idx(cu_idx); unsigned int opcode = 3; ebuf.add_arg_64(opcode); @@ -239,15 +218,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( return iree_ok_status(); } -static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch_indirect( - iree_hal_command_buffer_t* base_command_buffer, - iree_hal_executable_t* executable, int32_t entry_point, - iree_hal_buffer_ref_t workgroups_ref, iree_const_byte_span_t constants, - iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) { - return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "need xrt implementation of dispatch indirect"); -} - namespace { const iree_hal_command_buffer_vtable_t iree_hal_xrt_lite_direct_command_buffer_vtable = { @@ -258,7 +228,6 @@ const iree_hal_command_buffer_vtable_t .update_buffer = iree_hal_xrt_lite_direct_command_buffer_update_buffer, .copy_buffer = iree_hal_xrt_lite_direct_command_buffer_copy_buffer, .dispatch = iree_hal_xrt_lite_direct_command_buffer_dispatch, - .dispatch_indirect = - iree_hal_xrt_lite_direct_command_buffer_dispatch_indirect, + .dispatch_indirect = unimplemented, }; } // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h index 6aebaa624..705cf0909 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h @@ -18,15 +18,15 @@ extern "C" { // |out_command_buffer| must be released by the caller (see // iree_hal_command_buffer_release). 
iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( - shim_xdna::device* shim_device, - iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, + shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator, + iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_command_buffer_t** out_command_buffer); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index 33b84bcc7..686dda7bf 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -50,7 +50,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h index 0322944b3..45153266b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h @@ -22,8 +22,8 @@ extern "C" { // |out_executable_cache| must be released by the caller (see // iree_hal_executable_cache_release). 
iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( - shim_xdna::device* shim_device, - iree_string_view_t identifier, iree_allocator_t host_allocator, + shim_xdna::device* shim_device, iree_string_view_t identifier, + iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache); #ifdef __cplusplus diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h index 0a8623863..835a049db 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h @@ -21,7 +21,7 @@ iree_status_t iree_hal_xrt_lite_semaphore_create( iree_hal_semaphore_t** out_semaphore); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h index d983f27c1..2dba3fe86 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h @@ -27,17 +27,4 @@ iree_status_t unimplemented_ok_status(Params...) { template void unimplemented_ok_void(Params...) {} -#define MEMBER_WRAPPER(From, To, member, return_t) \ - template \ - static return_t To##_##member(From* b, Args... 
args) { \ - auto* obj = reinterpret_cast(b); \ - return obj->member(args...); \ - } - -#define MEMBER_WRAPPER_STATUS(From, To, member) \ - MEMBER_WRAPPER(From, To, member, iree_status_t) - -#define MEMBER_WRAPPER_VOID(From, To, member) \ - MEMBER_WRAPPER(From, To, member, void) - #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H From df0e66b7226bdaa93f51e656748c9ec0e4223df0 Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 16 Oct 2024 09:26:12 -0400 Subject: [PATCH 21/35] address comments --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 38 +++++------- .../src/iree-amd-aie/driver/xrt-lite/api.h | 8 --- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 47 ++++++++------- .../xrt-lite/cts/executable_cache_test.mlir | 2 +- .../iree-amd-aie/driver/xrt-lite/device.cc | 59 ++++++++++--------- .../driver/xrt-lite/direct_command_buffer.cc | 2 +- .../driver/xrt-lite/direct_command_buffer.h | 12 +--- .../iree-amd-aie/driver/xrt-lite/driver.cc | 4 +- .../driver/xrt-lite/executable.cc | 16 +---- .../iree-amd-aie/driver/xrt-lite/executable.h | 13 +--- .../driver/xrt-lite/nop_executable_cache.cc | 2 +- .../driver/xrt-lite/nop_executable_cache.h | 16 +---- .../driver/xrt-lite/nop_semaphore.h | 8 --- .../driver/xrt-lite/shim/CMakeLists.txt | 2 +- .../driver/xrt-lite/shim/linux/CMakeLists.txt | 2 +- 15 files changed, 84 insertions(+), 147 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 509556dad..ceae8130c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -31,10 +31,10 @@ struct iree_hal_xrt_lite_allocator { } }; -iree_hal_buffer_compatibility_t query_buffer_compatibility( +iree_hal_buffer_compatibility_t +iree_hal_xrt_lite_allocator_query_buffer_compatibility( iree_hal_allocator_t* base_allocator, iree_hal_buffer_params_t* params, iree_device_size_t* allocation_size) { - // All buffers can be allocated on the 
heap. iree_hal_buffer_compatibility_t compatibility = IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; @@ -50,7 +50,6 @@ iree_hal_buffer_compatibility_t query_buffer_compatibility( } } - // We are now optimal. params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; // Guard against the corner case where the requested buffer size is 0. The @@ -65,16 +64,16 @@ iree_hal_buffer_compatibility_t query_buffer_compatibility( return compatibility; } -iree_status_t allocate_buffer(iree_hal_allocator_t* base_allocator, - const iree_hal_buffer_params_t* params, - iree_device_size_t allocation_size, - iree_hal_buffer_t** out_buffer) { +iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( + iree_hal_allocator_t* base_allocator, + const iree_hal_buffer_params_t* params, iree_device_size_t allocation_size, + iree_hal_buffer_t** out_buffer) { iree_hal_xrt_lite_allocator* allocator = reinterpret_cast(base_allocator); - // Coerce options into those required by the current device. iree_hal_buffer_params_t compat_params = *params; - iree_hal_buffer_compatibility_t compatibility = query_buffer_compatibility( - base_allocator, &compat_params, &allocation_size); + iree_hal_buffer_compatibility_t compatibility = + iree_hal_xrt_lite_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); if (!iree_all_bits_set(compatibility, IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { return iree_make_status( @@ -104,8 +103,8 @@ iree_status_t allocate_buffer(iree_hal_allocator_t* base_allocator, return status; } -void deallocate_buffer(iree_hal_allocator_t* base_allocator, - iree_hal_buffer_t* base_buffer) { +void iree_hal_xrt_lite_allocator_deallocate_buffer( + iree_hal_allocator_t* base_allocator, iree_hal_buffer_t* base_buffer) { iree_hal_xrt_lite_allocator* allocator = reinterpret_cast(base_allocator); bool was_imported = false; @@ -117,12 +116,6 @@ void deallocate_buffer(iree_hal_allocator_t* base_allocator, iree_hal_buffer_destroy(base_buffer); } -static 
iree_hal_xrt_lite_allocator* iree_hal_xrt_lite_allocator_cast( - iree_hal_allocator_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_allocator_vtable); - return reinterpret_cast(base_value); -} - iree_status_t iree_hal_xrt_lite_allocator_create( iree_allocator_t host_allocator, shim_xdna::device* device, iree_hal_allocator_t** out_allocator) { @@ -151,7 +144,7 @@ static void iree_hal_xrt_lite_allocator_destroy( iree_hal_allocator_t* base_allocator) { IREE_ASSERT_ARGUMENT(base_allocator); iree_hal_xrt_lite_allocator* allocator = - iree_hal_xrt_lite_allocator_cast(base_allocator); + reinterpret_cast(base_allocator); IREE_TRACE_ZONE_BEGIN(z0); iree_hal_resource_release(&allocator->resource); @@ -173,8 +166,9 @@ const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable = { .host_allocator = iree_hal_xrt_lite_allocator_host_allocator, .trim = unimplemented_ok_status, .query_statistics = unimplemented_ok_void, - .query_buffer_compatibility = query_buffer_compatibility, - .allocate_buffer = allocate_buffer, - .deallocate_buffer = deallocate_buffer, + .query_buffer_compatibility = + iree_hal_xrt_lite_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_xrt_lite_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_xrt_lite_allocator_deallocate_buffer, }; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h index bd6049238..18d1bbff0 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -10,10 +10,6 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - struct iree_hal_xrt_lite_device_options_t {}; IREE_API_EXPORT void iree_hal_xrt_lite_device_options_initialize( @@ -41,8 +37,4 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create( const struct iree_hal_xrt_lite_device_options_t* options, iree_allocator_t host_allocator, 
iree_hal_device_t** out_device); -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index c9fd06bd7..01c94659d 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -19,9 +19,9 @@ struct iree_hal_xrt_lite_buffer { iree_hal_buffer_release_callback_t release_callback; }; -iree_status_t invalidate_range(iree_hal_buffer_t* base_buffer, - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { +iree_status_t iree_hal_xrt_lite_buffer_invalidate_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); if (IREE_UNLIKELY(!buffer->bo)) { @@ -34,12 +34,11 @@ iree_status_t invalidate_range(iree_hal_buffer_t* base_buffer, return iree_ok_status(); } -iree_status_t map_range(iree_hal_buffer_t* base_buffer, - iree_hal_mapping_mode_t mapping_mode, - iree_hal_memory_access_t memory_access, - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { +iree_status_t iree_hal_xrt_lite_buffer_map_range( + iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( @@ -57,8 +56,8 @@ iree_status_t map_range(iree_hal_buffer_t* base_buffer, // Should be guaranteed by previous checks. 
IREE_ASSERT(host_ptr != nullptr); uint8_t* data_ptr = reinterpret_cast(host_ptr) + local_byte_offset; - iree_status_t status = - invalidate_range(base_buffer, local_byte_offset, local_byte_length); + iree_status_t status = iree_hal_xrt_lite_buffer_invalidate_range( + base_buffer, local_byte_offset, local_byte_length); // If we mapped for discard, scribble over the bytes. This is not a mandated // behavior but it will make debugging issues easier. Alternatively for heap // buffers we could reallocate them such that ASAN yells, but that would @@ -72,9 +71,9 @@ iree_status_t map_range(iree_hal_buffer_t* base_buffer, return status; } -iree_status_t flush_range(iree_hal_buffer_t* base_buffer, - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { +iree_status_t iree_hal_xrt_lite_buffer_flush_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); if (IREE_UNLIKELY(!buffer->bo)) { @@ -87,11 +86,11 @@ iree_status_t flush_range(iree_hal_buffer_t* base_buffer, return iree_ok_status(); } -iree_status_t unmap_range(iree_hal_buffer_t* base_buffer, - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { - return flush_range(base_buffer, local_byte_offset, local_byte_length); +iree_status_t iree_hal_xrt_lite_buffer_unmap_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + return iree_hal_xrt_lite_buffer_flush_range(base_buffer, local_byte_offset, + local_byte_length); } iree_status_t iree_hal_xrt_lite_buffer_wrap( @@ -148,9 +147,9 @@ namespace { const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable = { .recycle = iree_hal_buffer_recycle, .destroy = iree_hal_xrt_lite_buffer_destroy, - .map_range = map_range, - .unmap_range = unmap_range, - 
.invalidate_range = invalidate_range, - .flush_range = flush_range, + .map_range = iree_hal_xrt_lite_buffer_map_range, + .unmap_range = iree_hal_xrt_lite_buffer_unmap_range, + .invalidate_range = iree_hal_xrt_lite_buffer_invalidate_range, + .flush_range = iree_hal_xrt_lite_buffer_flush_range, }; -} \ No newline at end of file +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir index ca306e1e5..dedbcab6b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir @@ -30,4 +30,4 @@ hal.executable.source public @amdaie_fb { return } } -} \ No newline at end of file +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 70da3dbdd..69c1181a1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -50,7 +50,7 @@ struct iree_hal_xrt_lite_device { } }; -iree_status_t create_executable_cache( +iree_status_t iree_hal_xrt_lite_device_create_executable_cache( iree_hal_device_t* base_value, iree_string_view_t identifier, iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { iree_hal_xrt_lite_device* device = @@ -60,7 +60,7 @@ iree_status_t create_executable_cache( out_executable_cache); } -iree_status_t create_command_buffer( +iree_status_t iree_hal_xrt_lite_device_create_command_buffer( iree_hal_device_t* base_value, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, @@ -76,17 +76,16 @@ iree_status_t create_command_buffer( &device->block_pool, device->host_allocator_, out_command_buffer); } -iree_status_t create_semaphore(iree_hal_device_t* base_value, - uint64_t initial_value, - 
iree_hal_semaphore_flags_t flags, - iree_hal_semaphore_t** out_semaphore) { +iree_status_t iree_hal_xrt_lite_device_create_semaphore( + iree_hal_device_t* base_value, uint64_t initial_value, + iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) { iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); return iree_hal_xrt_lite_semaphore_create(device->host_allocator_, initial_value, out_semaphore); } -iree_status_t queue_execute( +iree_status_t iree_hal_xrt_lite_device_queue_execute( iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, const iree_hal_semaphore_list_t wait_semaphore_list, const iree_hal_semaphore_list_t signal_semaphore_list, @@ -118,8 +117,8 @@ iree_status_t queue_execute( return iree_ok_status(); } -void replace_device_allocator(iree_hal_device_t* base_value, - iree_hal_allocator_t* new_allocator) { +void iree_hal_xrt_lite_device_replace_device_allocator( + iree_hal_device_t* base_value, iree_hal_allocator_t* new_allocator) { iree_hal_allocator_retain(new_allocator); iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); @@ -127,9 +126,10 @@ void replace_device_allocator(iree_hal_device_t* base_value, device->device_allocator_ = new_allocator; } -iree_status_t query_i64(iree_hal_device_t* base_value, - iree_string_view_t category, iree_string_view_t key, - int64_t* out_value) { +iree_status_t iree_hal_xrt_lite_device_query_i64(iree_hal_device_t* base_value, + iree_string_view_t category, + iree_string_view_t key, + int64_t* out_value) { *out_value = 0; iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); @@ -146,7 +146,7 @@ iree_status_t query_i64(iree_hal_device_t* base_value, return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported query"); } -iree_status_t queue_alloca( +iree_status_t iree_hal_xrt_lite_device_queue_alloca( iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, const iree_hal_semaphore_list_t wait_semaphore_list, const 
iree_hal_semaphore_list_t signal_semaphore_list, @@ -163,13 +163,13 @@ iree_status_t queue_alloca( return iree_ok_status(); } -iree_string_view_t id(iree_hal_device_t* base_value) { +iree_string_view_t iree_hal_xrt_lite_device_id(iree_hal_device_t* base_value) { iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); return device->identifier; } -void destroy(iree_hal_device_t* base_value) { +void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_value) { iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); IREE_TRACE_ZONE_BEGIN(z0); @@ -181,12 +181,14 @@ void destroy(iree_hal_device_t* base_value) { IREE_TRACE_ZONE_END(z0); }; -iree_allocator_t host_allocator(iree_hal_device_t* base_value) { +iree_allocator_t iree_hal_xrt_lite_device_host_allocator( + iree_hal_device_t* base_value) { iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); return device->host_allocator_; } -iree_hal_allocator_t* device_allocator(iree_hal_device_t* base_value) { +iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( + iree_hal_device_t* base_value) { iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); return device->device_allocator_; @@ -224,17 +226,18 @@ iree_status_t iree_hal_xrt_lite_device_create( namespace { const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { - .destroy = destroy, - .id = id, - .host_allocator = host_allocator, - .device_allocator = device_allocator, - .replace_device_allocator = replace_device_allocator, - .query_i64 = query_i64, - .create_command_buffer = create_command_buffer, - .create_executable_cache = create_executable_cache, - .create_semaphore = create_semaphore, - .queue_alloca = queue_alloca, - .queue_execute = queue_execute, + .destroy = iree_hal_xrt_lite_device_destroy, + .id = iree_hal_xrt_lite_device_id, + .host_allocator = iree_hal_xrt_lite_device_host_allocator, + .device_allocator = iree_hal_xrt_lite_device_device_allocator, + .replace_device_allocator = + 
iree_hal_xrt_lite_device_replace_device_allocator, + .query_i64 = iree_hal_xrt_lite_device_query_i64, + .create_command_buffer = iree_hal_xrt_lite_device_create_command_buffer, + .create_executable_cache = iree_hal_xrt_lite_device_create_executable_cache, + .create_semaphore = iree_hal_xrt_lite_device_create_semaphore, + .queue_alloca = iree_hal_xrt_lite_device_queue_alloca, + .queue_execute = iree_hal_xrt_lite_device_queue_execute, .profiling_begin = unimplemented_ok_status, .profiling_flush = unimplemented_ok_status, .profiling_end = unimplemented_ok_status, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index d3404c5f6..c63188b4e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -1,4 +1,4 @@ -// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h index 705cf0909..1612c9509 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h @@ -1,4 +1,4 @@ -// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,11 +11,7 @@ #include "iree/base/internal/arena.h" #include "iree/hal/api.h" -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// |out_command_buffer| must be released by the caller (see +// `out_command_buffer` must be released by the caller (see // iree_hal_command_buffer_release). 
iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator, @@ -25,8 +21,4 @@ iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( iree_allocator_t host_allocator, iree_hal_command_buffer_t** out_command_buffer); -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 743834034..19d4b0e8f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -7,6 +7,8 @@ #include "iree-amd-aie/driver/xrt-lite/api.h" #include "util.h" +#define IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT 0 + typedef struct iree_hal_xrt_lite_driver_t { iree_hal_resource_t resource; iree_allocator_t host_allocator; @@ -69,8 +71,6 @@ static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { IREE_TRACE_ZONE_END(z0); } -#define IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT 0 - static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( iree_hal_driver_t* base_driver, iree_allocator_t host_allocator, iree_host_size_t* out_device_info_count, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index d55e3f007..ac16174a6 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -1,4 +1,4 @@ -// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -25,12 +25,6 @@ iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast( return reinterpret_cast(base_value); } -// Verifies the structure of the flatbuffer so that we can avoid doing so during -// runtime. -// -// There are still some conditions we must be aware of (such as omitted names on -// functions with internal linkage), however we shouldn't need to bounds check -// anything within the flatbuffer after this succeeds. static iree_status_t iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( iree_const_byte_span_t flatbuffer_data) { @@ -41,9 +35,6 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( flatbuffer_data.data_length); } - // Run flatcc generated verification. This ensures all pointers are in-bounds - // and that we can safely walk the file, but not that the actual contents of - // the flatbuffer meet our expectations. int verify_ret = iree_amd_aie_hal_xrt_lite_ExecutableDef_verify_as_root( flatbuffer_data.data, flatbuffer_data.data_length); if (verify_ret != flatcc_verify_ok) { @@ -123,10 +114,6 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( iree_host_size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); - // Calculate the total number of characters across all entry point names. This - // is only required when tracing so that we can store copies of the names as - // the flatbuffer storing the strings may be released while the executable is - // still live. iree_host_size_t total_entry_point_name_chars = 0; IREE_TRACE({ for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; @@ -179,7 +166,6 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( asm_inst, asm_inst + flatbuffers_uint32_vec_len(asm_inst)); params->asm_inst = asmVector; - // Stash the entry point name in the string table for use when tracing. 
IREE_TRACE({ memcpy(string_table_buffer, params->kernel_name.data(), params->kernel_name.size()); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index 686dda7bf..c85d72d54 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -1,4 +1,4 @@ -// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -17,11 +17,6 @@ #include "iree/base/tracing.h" #include "iree/hal/api.h" -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// Object and launch parameters for a compute kernel. struct iree_hal_xrt_lite_kernel_params_t { std::vector pdi; std::vector asm_inst; @@ -42,15 +37,11 @@ struct iree_hal_xrt_lite_native_executable_t { iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast( iree_hal_executable_t* base_value); -// |out_executable| must be released by the caller (see +// `out_executable` must be released by the caller (see // iree_hal_executable_release). 
iree_status_t iree_hal_xrt_lite_native_executable_create( shim_xdna::device* shim_device, const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc index 8d0be5ad4..963f6ff88 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -1,4 +1,4 @@ -// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h index 45153266b..ed4a998b1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h @@ -1,4 +1,4 @@ -// Copyright 2023 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,23 +11,11 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// Creates a no-op executable cache that does not cache at all. -// This is useful to isolate pipeline caching behavior and verify compilation -// behavior. -// -// |out_executable_cache| must be released by the caller (see +// `out_executable_cache` must be released by the caller (see // iree_hal_executable_cache_release). 
iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( shim_xdna::device* shim_device, iree_string_view_t identifier, iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache); -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h index 835a049db..f7c5615e9 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h @@ -12,16 +12,8 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - iree_status_t iree_hal_xrt_lite_semaphore_create( iree_allocator_t host_allocator, uint64_t initial_value, iree_hal_semaphore_t** out_semaphore); -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt index ac1522216..c30c40e27 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt @@ -7,4 +7,4 @@ if(UNIX) add_subdirectory(linux) -endif() \ No newline at end of file +endif() diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt index afe3d583a..067d32f4a 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt @@ -5,4 +5,4 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -add_subdirectory(kmq) \ No newline at end of file +add_subdirectory(kmq) From 0045d4f1cbaf9e043e2003c22cbe984396b37072 Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 16 Oct 2024 10:01:47 -0400 Subject: [PATCH 22/35] add missing trace zones --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 24 ++- .../src/iree-amd-aie/driver/xrt-lite/api.h | 18 +-- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 49 +++++-- .../iree-amd-aie/driver/xrt-lite/device.cc | 137 ++++++++++++------ .../driver/xrt-lite/direct_command_buffer.cc | 39 ++--- .../iree-amd-aie/driver/xrt-lite/driver.cc | 50 ++++--- .../driver/xrt-lite/executable.cc | 26 ++-- .../iree-amd-aie/driver/xrt-lite/executable.h | 9 +- .../driver/xrt-lite/nop_executable_cache.cc | 54 ++++--- .../driver/xrt-lite/nop_semaphore.cc | 49 +++---- .../xrt-lite/registration/driver_module.c | 2 +- .../src/iree-amd-aie/driver/xrt-lite/util.h | 5 - 12 files changed, 273 insertions(+), 189 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index ceae8130c..a08432465 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -25,16 +25,20 @@ struct iree_hal_xrt_lite_allocator { shim_xdna::device* shim_device) : host_allocator(host_allocator), shim_device(shim_device) { IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_resource_initialize(&iree_hal_xrt_lite_allocator_vtable, &this->resource); + IREE_TRACE_ZONE_END(z0); } }; -iree_hal_buffer_compatibility_t +static iree_hal_buffer_compatibility_t iree_hal_xrt_lite_allocator_query_buffer_compatibility( iree_hal_allocator_t* base_allocator, iree_hal_buffer_params_t* params, iree_device_size_t* allocation_size) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_buffer_compatibility_t compatibility = IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; @@ -61,13 +65,16 @@ 
iree_hal_xrt_lite_allocator_query_buffer_compatibility( // can act safely even on buffer ranges that are not naturally aligned. *allocation_size = iree_host_align(*allocation_size, 4); + IREE_TRACE_ZONE_END(z0); return compatibility; } -iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( +static iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( iree_hal_allocator_t* base_allocator, const iree_hal_buffer_params_t* params, iree_device_size_t allocation_size, iree_hal_buffer_t** out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_allocator* allocator = reinterpret_cast(base_allocator); iree_hal_buffer_params_t compat_params = *params; @@ -76,6 +83,7 @@ iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( base_allocator, &compat_params, &allocation_size); if (!iree_all_bits_set(compatibility, IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { + IREE_TRACE_ZONE_END(z0); return iree_make_status( IREE_STATUS_INVALID_ARGUMENT, "allocator cannot allocate a buffer with the given parameters"); @@ -100,11 +108,15 @@ iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( } else { iree_hal_buffer_release(buffer); } + + IREE_TRACE_ZONE_END(z0); return status; } -void iree_hal_xrt_lite_allocator_deallocate_buffer( +static void iree_hal_xrt_lite_allocator_deallocate_buffer( iree_hal_allocator_t* base_allocator, iree_hal_buffer_t* base_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_allocator* allocator = reinterpret_cast(base_allocator); bool was_imported = false; @@ -114,6 +126,8 @@ void iree_hal_xrt_lite_allocator_deallocate_buffer( iree_hal_buffer_allocation_size(base_buffer))); } iree_hal_buffer_destroy(base_buffer); + + IREE_TRACE_ZONE_END(z0); } iree_status_t iree_hal_xrt_lite_allocator_create( @@ -155,8 +169,12 @@ static void iree_hal_xrt_lite_allocator_destroy( static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator( const iree_hal_allocator_t* base_allocator) { + IREE_TRACE_ZONE_BEGIN(z0); + const 
iree_hal_xrt_lite_allocator* allocator = reinterpret_cast(base_allocator); + + IREE_TRACE_ZONE_END(z0); return allocator->host_allocator; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h index 18d1bbff0..62b2a9fae 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -10,31 +10,31 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -struct iree_hal_xrt_lite_device_options_t {}; +struct iree_hal_xrt_lite_device_options {}; IREE_API_EXPORT void iree_hal_xrt_lite_device_options_initialize( - struct iree_hal_xrt_lite_device_options_t* out_params); + struct iree_hal_xrt_lite_device_options* out_params); -struct iree_hal_xrt_lite_driver_options_t { - struct iree_hal_xrt_lite_device_options_t default_device_options; +struct iree_hal_xrt_lite_driver_options { + struct iree_hal_xrt_lite_device_options default_device_options; }; IREE_API_EXPORT void iree_hal_xrt_lite_driver_options_initialize( - struct iree_hal_xrt_lite_driver_options_t* out_options); + struct iree_hal_xrt_lite_driver_options* out_options); -// The provided |identifier| will be used by programs to distinguish the device +// The provided `identifier` will be used by programs to distinguish the device // type from other HAL implementations. If compiling programs with the IREE // compiler this must match the value used by IREE::HAL::TargetDevice. // -// |out_driver| must be released by the caller (see iree_hal_driver_release). +// `out_driver` must be released by the caller (see iree_hal_driver_release). 
IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( iree_string_view_t identifier, - const struct iree_hal_xrt_lite_driver_options_t* options, + const struct iree_hal_xrt_lite_driver_options* options, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver); IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create( iree_string_view_t identifier, - const struct iree_hal_xrt_lite_device_options_t* options, + const struct iree_hal_xrt_lite_device_options* options, iree_allocator_t host_allocator, iree_hal_device_t** out_device); #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index 01c94659d..b4cf1d7bf 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -19,38 +19,47 @@ struct iree_hal_xrt_lite_buffer { iree_hal_buffer_release_callback_t release_callback; }; -iree_status_t iree_hal_xrt_lite_buffer_invalidate_range( +static iree_status_t iree_hal_xrt_lite_buffer_invalidate_range( iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); if (IREE_UNLIKELY(!buffer->bo)) { + IREE_TRACE_ZONE_END(z0); return iree_make_status( IREE_STATUS_FAILED_PRECONDITION, "buffer does not have device memory attached and cannot be mapped"); } buffer->bo->sync(shim_xdna::direction::device2host, local_byte_length, local_byte_offset); + + IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } -iree_status_t iree_hal_xrt_lite_buffer_map_range( +static iree_status_t iree_hal_xrt_lite_buffer_map_range( iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, iree_hal_memory_access_t memory_access, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + 
IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); - IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( - iree_hal_buffer_memory_type( - reinterpret_cast(buffer)), - IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); - IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( - iree_hal_buffer_allowed_usage( - reinterpret_cast(buffer)), - mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT - ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT - : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type( + reinterpret_cast(buffer)), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_buffer_validate_usage( + iree_hal_buffer_allowed_usage( + reinterpret_cast(buffer)), + mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT + ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT + : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); void* host_ptr = buffer->bo->map(); // Should be guaranteed by previous checks. 
@@ -68,25 +77,33 @@ iree_status_t iree_hal_xrt_lite_buffer_map_range( } #endif // !NDEBUG mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + + IREE_TRACE_ZONE_END(z0); return status; } -iree_status_t iree_hal_xrt_lite_buffer_flush_range( +static iree_status_t iree_hal_xrt_lite_buffer_flush_range( iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); if (IREE_UNLIKELY(!buffer->bo)) { + IREE_TRACE_ZONE_END(z0); return iree_make_status( IREE_STATUS_FAILED_PRECONDITION, "buffer does not have device memory attached and cannot be mapped"); } + buffer->bo->sync(shim_xdna::direction::host2device, local_byte_length, local_byte_offset); + + IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } -iree_status_t iree_hal_xrt_lite_buffer_unmap_range( +static iree_status_t iree_hal_xrt_lite_buffer_unmap_range( iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { return iree_hal_xrt_lite_buffer_flush_range(base_buffer, local_byte_offset, @@ -138,8 +155,12 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { } shim_xdna::bo* iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_buffer* buffer = reinterpret_cast(base_buffer); + + IREE_TRACE_ZONE_END(z0); return buffer->bo.get(); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 69c1181a1..da7e793be 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -24,77 +24,94 @@ extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; struct iree_hal_xrt_lite_device { iree_hal_resource_t resource; iree_string_view_t identifier; - 
iree_allocator_t host_allocator_; - // not used - iree_hal_allocator_t* device_allocator_; - // Block pool used for command buffers with a larger block size (as command - // buffers can contain inlined data uploads). + iree_allocator_t host_allocator; + // TODO(max): not used because "device allocations" are performed through + // device + iree_hal_allocator_t* device_allocator; + // block pool used for command buffer allocations, uses a larger block size + // since command buffers can contain inlined data iree_arena_block_pool_t block_pool; shim_xdna::device* shim_device; - iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_options_t* options, + iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_options* options, iree_allocator_t host_allocator) { IREE_ASSERT_ARGUMENT(options); IREE_TRACE_ZONE_BEGIN(z0); iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &resource); - this->host_allocator_ = host_allocator; + this->host_allocator = host_allocator; shim_device = new shim_xdna::device; iree_status_t status = iree_hal_xrt_lite_allocator_create( - host_allocator, shim_device, &device_allocator_); + host_allocator, shim_device, &device_allocator); IREE_ASSERT(iree_status_is_ok(status)); iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, &block_pool); + IREE_TRACE_ZONE_END(z0); } }; -iree_status_t iree_hal_xrt_lite_device_create_executable_cache( +static iree_status_t iree_hal_xrt_lite_device_create_executable_cache( iree_hal_device_t* base_value, iree_string_view_t identifier, iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); + + IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_nop_executable_cache_create( - device->shim_device, identifier, device->host_allocator_, + device->shim_device, identifier, device->host_allocator, out_executable_cache); } -iree_status_t iree_hal_xrt_lite_device_create_command_buffer( 
+static iree_status_t iree_hal_xrt_lite_device_create_command_buffer( iree_hal_device_t* base_value, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, iree_hal_command_buffer_t** out_command_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) { + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unimplmented multi-shot command buffer"); } + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); + + IREE_TRACE_ZONE_END(z0); return iree_hal_deferred_command_buffer_create( - device->device_allocator_, mode, command_categories, binding_capacity, - &device->block_pool, device->host_allocator_, out_command_buffer); + device->device_allocator, mode, command_categories, binding_capacity, + &device->block_pool, device->host_allocator, out_command_buffer); } -iree_status_t iree_hal_xrt_lite_device_create_semaphore( +static iree_status_t iree_hal_xrt_lite_device_create_semaphore( iree_hal_device_t* base_value, uint64_t initial_value, iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); - return iree_hal_xrt_lite_semaphore_create(device->host_allocator_, + + IREE_TRACE_ZONE_END(z0); + return iree_hal_xrt_lite_semaphore_create(device->host_allocator, initial_value, out_semaphore); } -iree_status_t iree_hal_xrt_lite_device_queue_execute( +static iree_status_t iree_hal_xrt_lite_device_queue_execute( iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, const iree_hal_semaphore_list_t wait_semaphore_list, const iree_hal_semaphore_list_t signal_semaphore_list, iree_host_size_t command_buffer_count, iree_hal_command_buffer_t* const* command_buffers, iree_hal_buffer_binding_table_t const* binding_tables) { + IREE_TRACE_ZONE_BEGIN(z0); + 
iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); - IREE_TRACE_ZONE_BEGIN(z0); for (iree_host_size_t i = 0; i < command_buffer_count; i++) { iree_hal_command_buffer_t* xrt_command_buffer = nullptr; @@ -104,32 +121,38 @@ iree_status_t iree_hal_xrt_lite_device_queue_execute( IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_xrt_lite_direct_command_buffer_create( - device->shim_device, device->device_allocator_, mode, + device->shim_device, device->device_allocator, mode, IREE_HAL_COMMAND_CATEGORY_ANY, /*binding_capacity=*/0, &device->block_pool, - device->host_allocator_, &xrt_command_buffer)); + device->host_allocator, &xrt_command_buffer)); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_deferred_command_buffer_apply( command_buffers[i], xrt_command_buffer, iree_hal_buffer_binding_table_empty())); } + IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } -void iree_hal_xrt_lite_device_replace_device_allocator( +static void iree_hal_xrt_lite_device_replace_device_allocator( iree_hal_device_t* base_value, iree_hal_allocator_t* new_allocator) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_allocator_retain(new_allocator); iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); - iree_hal_allocator_release(device->device_allocator_); - device->device_allocator_ = new_allocator; + // Release the previous allocator before the pointer is overwritten. + iree_hal_allocator_release(device->device_allocator); + device->device_allocator = new_allocator; + IREE_TRACE_ZONE_END(z0); } -iree_status_t iree_hal_xrt_lite_device_query_i64(iree_hal_device_t* base_value, - iree_string_view_t category, - iree_string_view_t key, - int64_t* out_value) { +static iree_status_t iree_hal_xrt_lite_device_query_i64( + iree_hal_device_t* base_value, iree_string_view_t category, + iree_string_view_t key, int64_t* out_value) { + IREE_TRACE_ZONE_BEGIN(z0); + *out_value = 0; iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); @@ -143,65 +166,93 @@ iree_status_t
iree_hal_xrt_lite_device_query_i64(iree_hal_device_t* base_value, *out_value = iree_string_view_equal(key, IREE_SV("amdaie-pdi-fb")) ? 1 : 0; return iree_ok_status(); } + + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported query"); } -iree_status_t iree_hal_xrt_lite_device_queue_alloca( +static iree_status_t iree_hal_xrt_lite_device_queue_alloca( iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, const iree_hal_semaphore_list_t wait_semaphore_list, const iree_hal_semaphore_list_t signal_semaphore_list, iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, iree_device_size_t allocation_size, iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); - IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list, - iree_infinite_timeout())); - IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( - device->device_allocator_, params, allocation_size, out_buffer)); - IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_semaphore_list_wait(wait_semaphore_list, + iree_infinite_timeout())); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_allocator_allocate_buffer(device->device_allocator, params, + allocation_size, out_buffer)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_semaphore_list_signal(signal_semaphore_list)); + + IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } -iree_string_view_t iree_hal_xrt_lite_device_id(iree_hal_device_t* base_value) { +static iree_string_view_t iree_hal_xrt_lite_device_id( + iree_hal_device_t* base_value) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); + + IREE_TRACE_ZONE_END(z0); return device->identifier; } -void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_value) { +static void 
iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_value) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); - IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_allocator_release(device->device_allocator_); + iree_hal_allocator_release(device->device_allocator); delete device->shim_device; - iree_allocator_free(device->host_allocator_, device); + iree_allocator_free(device->host_allocator, device); IREE_TRACE_ZONE_END(z0); }; -iree_allocator_t iree_hal_xrt_lite_device_host_allocator( +static iree_allocator_t iree_hal_xrt_lite_device_host_allocator( iree_hal_device_t* base_value) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); - return device->host_allocator_; + + IREE_TRACE_ZONE_END(z0); + return device->host_allocator; } -iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( + +static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( iree_hal_device_t* base_value) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_device* device = reinterpret_cast(base_value); - return device->device_allocator_; + + IREE_TRACE_ZONE_END(z0); + return device->device_allocator; } void iree_hal_xrt_lite_device_options_initialize( - iree_hal_xrt_lite_device_options_t* out_options) { + iree_hal_xrt_lite_device_options* out_options) { + IREE_TRACE_ZONE_BEGIN(z0); + memset(out_options, 0, sizeof(*out_options)); + + IREE_TRACE_ZONE_END(z0); } iree_status_t iree_hal_xrt_lite_device_create( iree_string_view_t identifier, - const iree_hal_xrt_lite_device_options_t* options, + const iree_hal_xrt_lite_device_options* options, iree_allocator_t host_allocator, iree_hal_device_t** out_device) { IREE_ASSERT_ARGUMENT(options); IREE_ASSERT_ARGUMENT(out_device); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index c63188b4e..6a9573cf8 100644 --- 
a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -26,16 +26,6 @@ struct iree_hal_xrt_lite_direct_command_buffer { iree_arena_allocator_t arena; shim_xdna::device* shim_device; - - struct { - shim_xdna::bo* bindings[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT]; - // Offset and length are used to get the sub buffer at kernel launch. - iree_device_size_t - offsets[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT]; - iree_device_size_t - lengths[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT]; - - } descriptor_sets[IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_COUNT]; }; namespace { @@ -43,14 +33,6 @@ extern const iree_hal_command_buffer_vtable_t iree_hal_xrt_lite_direct_command_buffer_vtable; } // namespace -static iree_hal_xrt_lite_direct_command_buffer* -iree_hal_xrt_lite_direct_command_buffer_cast( - iree_hal_command_buffer_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, - &iree_hal_xrt_lite_direct_command_buffer_vtable); - return reinterpret_cast(base_value); -} - iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator, iree_hal_command_buffer_mode_t mode, @@ -96,13 +78,15 @@ iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( return status; } + static void iree_hal_xrt_lite_direct_command_buffer_destroy( iree_hal_command_buffer_t* base_command_buffer) { - iree_hal_xrt_lite_direct_command_buffer* command_buffer = - iree_hal_xrt_lite_direct_command_buffer_cast(base_command_buffer); - iree_allocator_t host_allocator = command_buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_direct_command_buffer* command_buffer = + reinterpret_cast( + base_command_buffer); + iree_allocator_t host_allocator = command_buffer->host_allocator; iree_hal_resource_set_free(command_buffer->resource_set); iree_arena_deinitialize(&command_buffer->arena); 
iree_allocator_free(host_allocator, command_buffer); @@ -114,8 +98,8 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_update_buffer( iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer, iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) { IREE_TRACE_ZONE_BEGIN(z0); - const uint8_t* src = (const uint8_t*)source_buffer + source_offset; + const uint8_t* src = (const uint8_t*)source_buffer + source_offset; // No need to Allocate scratch space (in an arena) as the memcpy // used below is expected to be synchronized. shim_xdna::bo* target_device_buffer = iree_hal_xrt_lite_buffer_handle( @@ -160,16 +144,16 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( iree_hal_executable_t* base_executable, int32_t entry_point, const uint32_t workgroup_count[3], iree_const_byte_span_t constants, iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) { + IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_direct_command_buffer* command_buffer = reinterpret_cast( base_command_buffer); - IREE_TRACE_ZONE_BEGIN(z0); - // Lookup kernel parameters used for side-channeling additional launch // information from the compiler. 
- iree_hal_xrt_lite_native_executable_t* executable = - iree_hal_xrt_lite_native_executable_cast(base_executable); - iree_hal_xrt_lite_kernel_params_t kernel_params = + iree_hal_xrt_lite_native_executable* executable = + reinterpret_cast(base_executable); + iree_hal_xrt_lite_kernel_params kernel_params = executable->entry_points[entry_point]; IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -214,7 +198,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( } IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 19d4b0e8f..738dda0f3 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -9,41 +9,39 @@ #define IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT 0 -typedef struct iree_hal_xrt_lite_driver_t { +struct iree_hal_xrt_lite_driver { iree_hal_resource_t resource; iree_allocator_t host_allocator; iree_string_view_t identifier; - iree_hal_xrt_lite_driver_options_t options; + iree_hal_xrt_lite_driver_options options; // + trailing identifier string storage -} iree_hal_xrt_lite_driver_t; +}; namespace { extern const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable; } -static iree_hal_xrt_lite_driver_t* iree_hal_xrt_lite_driver_cast( - iree_hal_driver_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_driver_vtable); - return reinterpret_cast(base_value); -} - void iree_hal_xrt_lite_driver_options_initialize( - iree_hal_xrt_lite_driver_options_t* out_options) { + iree_hal_xrt_lite_driver_options* out_options) { + IREE_TRACE_ZONE_BEGIN(z0); + memset(out_options, 0, sizeof(*out_options)); iree_hal_xrt_lite_device_options_initialize( &out_options->default_device_options); + + IREE_TRACE_ZONE_END(z0); } IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( iree_string_view_t identifier, - const iree_hal_xrt_lite_driver_options_t* options, + const 
iree_hal_xrt_lite_driver_options* options, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) { IREE_ASSERT_ARGUMENT(options); IREE_ASSERT_ARGUMENT(out_driver); *out_driver = nullptr; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_driver_t* driver = nullptr; + iree_hal_xrt_lite_driver* driver = nullptr; iree_host_size_t total_size = sizeof(*driver) + identifier.size; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, total_size, (void**)&driver)); @@ -61,8 +59,8 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( } static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { - iree_hal_xrt_lite_driver_t* driver = - iree_hal_xrt_lite_driver_cast(base_driver); + iree_hal_xrt_lite_driver* driver = + reinterpret_cast(base_driver); iree_allocator_t host_allocator = driver->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); @@ -75,6 +73,8 @@ static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( iree_hal_driver_t* base_driver, iree_allocator_t host_allocator, iree_host_size_t* out_device_info_count, iree_hal_device_info_t** out_device_infos) { + IREE_TRACE_ZONE_BEGIN(z0); + static const iree_hal_device_info_t device_infos[1] = { { .device_id = IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT, @@ -82,6 +82,8 @@ static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( }, }; *out_device_info_count = IREE_ARRAYSIZE(device_infos); + + IREE_TRACE_ZONE_END(z0); return iree_allocator_clone( host_allocator, iree_make_const_byte_span(device_infos, sizeof(device_infos)), @@ -92,10 +94,14 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id, iree_host_size_t param_count, const iree_string_pair_t* params, iree_allocator_t host_allocator, iree_hal_device_t** out_device) { - iree_hal_xrt_lite_driver_t* driver = - iree_hal_xrt_lite_driver_cast(base_driver); - iree_hal_xrt_lite_device_options_t options = + 
IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_driver* driver = + reinterpret_cast(base_driver); + iree_hal_xrt_lite_device_options options = driver->options.default_device_options; + + IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_device_create(driver->identifier, &options, host_allocator, out_device); } @@ -105,10 +111,14 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path( iree_string_view_t device_path, iree_host_size_t param_count, const iree_string_pair_t* params, iree_allocator_t host_allocator, iree_hal_device_t** out_device) { - iree_hal_xrt_lite_driver_t* driver = - iree_hal_xrt_lite_driver_cast(base_driver); - iree_hal_xrt_lite_device_options_t options = + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_driver* driver = + reinterpret_cast(base_driver); + iree_hal_xrt_lite_device_options options = driver->options.default_device_options; + + IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_device_create(driver->identifier, &options, host_allocator, out_device); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index ac16174a6..b2fba7557 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -19,16 +19,13 @@ extern const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable; } // namespace -iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast( - iree_hal_executable_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_native_executable_vtable); - return reinterpret_cast(base_value); -} - static iree_status_t iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( iree_const_byte_span_t flatbuffer_data) { + IREE_TRACE_ZONE_BEGIN(z0); + if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) { + IREE_TRACE_ZONE_END(z0); return iree_make_status( IREE_STATUS_INVALID_ARGUMENT, "flatbuffer data is not present 
or less than 16 bytes (%zu total)", @@ -38,6 +35,7 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( int verify_ret = iree_amd_aie_hal_xrt_lite_ExecutableDef_verify_as_root( flatbuffer_data.data, flatbuffer_data.data_length); if (verify_ret != flatcc_verify_ok) { + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "flatbuffer verification failed: %s", flatcc_verify_error_string(verify_ret)); @@ -50,12 +48,14 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); if (entry_point_count == 0) { + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no entry points found in the executable"); } for (size_t i = 0; i < entry_point_count; ++i) { if (!flatbuffers_string_len( flatbuffers_string_vec_at(entry_points_vec, i))) { + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "executable entry point %zu has no name", i); } @@ -65,6 +65,7 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); size_t number_pdi = iree_amd_aie_hal_xrt_lite_PdiDef_vec_len(pdis); if (number_pdi == 0) { + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no pdi present"); } @@ -73,12 +74,14 @@ iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( size_t number_asm_instr = iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_len(asm_instr); if (number_asm_instr != entry_point_count) { + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "number of entry points (%zu) and number of asm " "instructions (%zu) mismatched", entry_point_count, number_asm_instr); } + IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -91,7 +94,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( IREE_TRACE_ZONE_BEGIN(z0); 
*out_executable = nullptr; - iree_hal_xrt_lite_native_executable_t* executable = nullptr; + iree_hal_xrt_lite_native_executable* executable = nullptr; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( @@ -141,7 +144,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( executable->entry_point_count = entry_point_count; for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; entry_ordinal++) { - iree_hal_xrt_lite_kernel_params_t* params = + iree_hal_xrt_lite_kernel_params* params = &executable->entry_points[entry_ordinal]; params->kernel_name = flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); @@ -193,17 +196,18 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( } *out_executable = reinterpret_cast(executable); + IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } static void iree_hal_xrt_lite_native_executable_destroy( iree_hal_executable_t* base_executable) { - iree_hal_xrt_lite_native_executable_t* executable = - iree_hal_xrt_lite_native_executable_cast(base_executable); - iree_allocator_t host_allocator = executable->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_native_executable* executable = + reinterpret_cast(base_executable); + iree_allocator_t host_allocator = executable->host_allocator; iree_allocator_free(host_allocator, executable); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index c85d72d54..ac88f2192 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -17,7 +17,7 @@ #include "iree/base/tracing.h" #include "iree/hal/api.h" -struct iree_hal_xrt_lite_kernel_params_t { +struct iree_hal_xrt_lite_kernel_params { std::vector pdi; std::vector asm_inst; std::string kernel_name; @@ -25,18 +25,15 @@ struct iree_hal_xrt_lite_kernel_params_t { IREE_TRACE(uint32_t 
source_line;) }; -struct iree_hal_xrt_lite_native_executable_t { +struct iree_hal_xrt_lite_native_executable { // Abstract resource used for injecting reference counting and vtable; must be // at offset 0. iree_hal_resource_t resource; iree_allocator_t host_allocator; iree_host_size_t entry_point_count; - iree_hal_xrt_lite_kernel_params_t entry_points[16]; + iree_hal_xrt_lite_kernel_params entry_points[16]; }; -iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast( - iree_hal_executable_t* base_value); - // `out_executable` must be released by the caller (see // iree_hal_executable_release). iree_status_t iree_hal_xrt_lite_native_executable_create( diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc index 963f6ff88..62b3ebfa4 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -13,26 +13,29 @@ #include "iree/base/api.h" #include "iree/base/tracing.h" -struct iree_hal_xrt_lite_nop_executable_cache_t { +namespace { +extern const iree_hal_executable_cache_vtable_t + iree_hal_xrt_lite_nop_executable_cache_vtable; +} // namespace + +struct iree_hal_xrt_lite_nop_executable_cache { // Abstract resource used for injecting reference counting and vtable; must be // at offset 0. 
iree_hal_resource_t resource; shim_xdna::device* shim_device; iree_allocator_t host_allocator; -}; -namespace { -extern const iree_hal_executable_cache_vtable_t - iree_hal_xrt_lite_nop_executable_cache_vtable; -} // namespace + iree_hal_xrt_lite_nop_executable_cache(shim_xdna::device* shim_device, + iree_allocator_t host_allocator) + : shim_device(shim_device), host_allocator(host_allocator) { + IREE_TRACE_ZONE_BEGIN(z0); -static iree_hal_xrt_lite_nop_executable_cache_t* -iree_hal_xrt_lite_nop_executable_cache_cast( - iree_hal_executable_cache_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, - &iree_hal_xrt_lite_nop_executable_cache_vtable); - return (iree_hal_xrt_lite_nop_executable_cache_t*)base_value; -} + iree_hal_resource_initialize(&iree_hal_xrt_lite_nop_executable_cache_vtable, + &resource); + + IREE_TRACE_ZONE_END(z0); + } +}; iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( shim_xdna::device* shim_device, iree_string_view_t identifier, @@ -42,26 +45,26 @@ iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( *out_executable_cache = nullptr; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_nop_executable_cache_t* executable_cache = nullptr; + iree_hal_xrt_lite_nop_executable_cache* executable_cache = nullptr; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, sizeof(*executable_cache), (void**)&executable_cache)); - iree_hal_resource_initialize(&iree_hal_xrt_lite_nop_executable_cache_vtable, - &executable_cache->resource); - executable_cache->host_allocator = host_allocator; - executable_cache->shim_device = shim_device; + executable_cache = new (executable_cache) + iree_hal_xrt_lite_nop_executable_cache(shim_device, host_allocator); + *out_executable_cache = + reinterpret_cast(executable_cache); - *out_executable_cache = (iree_hal_executable_cache_t*)executable_cache; IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } static void iree_hal_xrt_lite_nop_executable_cache_destroy( iree_hal_executable_cache_t* 
base_executable_cache) { - iree_hal_xrt_lite_nop_executable_cache_t* executable_cache = - iree_hal_xrt_lite_nop_executable_cache_cast(base_executable_cache); IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_nop_executable_cache* executable_cache = + reinterpret_cast( + base_executable_cache); iree_allocator_free(executable_cache->host_allocator, executable_cache); IREE_TRACE_ZONE_END(z0); @@ -79,8 +82,13 @@ static iree_status_t iree_hal_xrt_lite_nop_executable_cache_prepare_executable( iree_hal_executable_cache_t* base_executable_cache, const iree_hal_executable_params_t* executable_params, iree_hal_executable_t** out_executable) { - iree_hal_xrt_lite_nop_executable_cache_t* executable_cache = - iree_hal_xrt_lite_nop_executable_cache_cast(base_executable_cache); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_nop_executable_cache* executable_cache = + reinterpret_cast( + base_executable_cache); + + IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_native_executable_create( executable_cache->shim_device, executable_params, executable_cache->host_allocator, out_executable); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc index 173db9483..c3e285b82 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc @@ -10,21 +10,22 @@ #include "iree/hal/utils/semaphore_base.h" #include "util.h" -struct iree_hal_xrt_lite_semaphore_t { - iree_hal_semaphore_t base; - iree_atomic_int64_t value; - iree_allocator_t host_allocator; -}; - namespace { extern const iree_hal_semaphore_vtable_t iree_hal_xrt_lite_semaphore_vtable; } // namespace -static iree_hal_xrt_lite_semaphore_t* iree_hal_xrt_lite_semaphore_cast( - iree_hal_semaphore_t* base_value) { - IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_semaphore_vtable); - return (iree_hal_xrt_lite_semaphore_t*)base_value; -} +struct iree_hal_xrt_lite_semaphore { + 
iree_hal_semaphore_t base; + iree_atomic_int64_t value; + iree_allocator_t host_allocator; + + iree_hal_xrt_lite_semaphore(uint64_t initial_value, + iree_allocator_t host_allocator) + : value(initial_value), host_allocator(host_allocator) { + iree_hal_semaphore_initialize(&iree_hal_xrt_lite_semaphore_vtable, &base); + iree_atomic_store_int64(&value, initial_value, iree_memory_order_release); + } +}; iree_status_t iree_hal_xrt_lite_semaphore_create( iree_allocator_t host_allocator, uint64_t initial_value, @@ -32,29 +33,25 @@ iree_status_t iree_hal_xrt_lite_semaphore_create( IREE_ASSERT_ARGUMENT(out_semaphore); IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_semaphore_t* semaphore = nullptr; - iree_status_t status = iree_allocator_malloc( - host_allocator, sizeof(*semaphore), (void**)&semaphore); - if (iree_status_is_ok(status)) { - iree_hal_semaphore_initialize(&iree_hal_xrt_lite_semaphore_vtable, - &semaphore->base); - semaphore->host_allocator = host_allocator; - iree_atomic_store_int64(&semaphore->value, initial_value, - iree_memory_order_release); - *out_semaphore = &semaphore->base; - } + iree_hal_xrt_lite_semaphore* semaphore = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, sizeof(*semaphore), + (void**)&semaphore)); + semaphore = new (semaphore) + iree_hal_xrt_lite_semaphore(initial_value, host_allocator); + *out_semaphore = &semaphore->base; IREE_TRACE_ZONE_END(z0); - return status; + return iree_ok_status(); } static void iree_hal_xrt_lite_semaphore_destroy( iree_hal_semaphore_t* base_semaphore) { - iree_hal_xrt_lite_semaphore_t* semaphore = - iree_hal_xrt_lite_semaphore_cast(base_semaphore); - iree_allocator_t host_allocator = semaphore->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_semaphore* semaphore = + reinterpret_cast(base_semaphore); + iree_allocator_t host_allocator = semaphore->host_allocator; iree_hal_semaphore_deinitialize(&semaphore->base); iree_allocator_free(host_allocator, 
semaphore); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c index d0c82e79f..928305857 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c @@ -34,7 +34,7 @@ static iree_status_t iree_hal_xrt_lite_driver_factory_try_create( // used in native tools that have access to the flags library. Programmatic // creation of the driver and devices will bypass this file and pass the // options via this struct or key-value string parameters. - struct iree_hal_xrt_lite_driver_options_t options; + struct iree_hal_xrt_lite_driver_options options; iree_hal_xrt_lite_driver_options_initialize(&options); iree_status_t status = iree_hal_xrt_lite_driver_create( diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h index 2dba3fe86..ff3de896c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h @@ -14,11 +14,6 @@ iree_status_t unimplemented(Params...) { return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unimplemented"); } -template -void unimplemented(Params...) { - IREE_ASSERT(false && "unimplemented"); -} - template iree_status_t unimplemented_ok_status(Params...) 
{ return iree_ok_status(); From 7356f7a28b31c4f1a91790d47f9b58cb8319f9cc Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 16 Oct 2024 10:49:18 -0400 Subject: [PATCH 23/35] remove smart pointers --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 6 ++--- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 10 ++++---- .../src/iree-amd-aie/driver/xrt-lite/buffer.h | 2 +- .../driver/xrt-lite/direct_command_buffer.cc | 9 ++++--- .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 24 +++++++++---------- .../driver/xrt-lite/shim/linux/kmq/bo.h | 6 +++++ .../driver/xrt-lite/shim/linux/kmq/device.cpp | 14 +++++------ .../driver/xrt-lite/shim/linux/kmq/device.h | 10 ++++---- .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 3 +++ 9 files changed, 46 insertions(+), 38 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index a08432465..7b4fd00bd 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -90,11 +90,11 @@ static iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( } uint32_t flags = XCL_BO_FLAGS_HOST_ONLY; - std::unique_ptr bo = - allocator->shim_device->alloc_bo(allocation_size, flags); + shim_xdna::bo* bo = + allocator->shim_device->alloc_bo(allocation_size, flags).release(); iree_hal_buffer_t* buffer = nullptr; iree_status_t status = iree_hal_xrt_lite_buffer_wrap( - std::move(bo), reinterpret_cast(allocator), + bo, reinterpret_cast(allocator), compat_params.type, compat_params.access, compat_params.usage, allocation_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index b4cf1d7bf..40e84daae 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -15,7 +15,7 @@ extern const iree_hal_buffer_vtable_t 
iree_hal_xrt_lite_buffer_vtable; struct iree_hal_xrt_lite_buffer { iree_hal_buffer_t base; - std::unique_ptr bo; + shim_xdna::bo* bo; iree_hal_buffer_release_callback_t release_callback; }; @@ -111,7 +111,7 @@ static iree_status_t iree_hal_xrt_lite_buffer_unmap_range( } iree_status_t iree_hal_xrt_lite_buffer_wrap( - std::unique_ptr bo, iree_hal_allocator_t* allocator, + shim_xdna::bo* bo, iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, @@ -130,7 +130,7 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( memory_type, allowed_access, allowed_usage, &iree_hal_xrt_lite_buffer_vtable, &buffer->base); buffer->release_callback = release_callback; - buffer->bo = std::move(bo); + buffer->bo = bo; *out_buffer = &buffer->base; IREE_TRACE_ZONE_END(z0); @@ -148,7 +148,7 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { base_buffer); } - buffer->bo.reset(); + delete buffer->bo; iree_allocator_free(host_allocator, buffer); IREE_TRACE_ZONE_END(z0); @@ -161,7 +161,7 @@ shim_xdna::bo* iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer) { reinterpret_cast(base_buffer); IREE_TRACE_ZONE_END(z0); - return buffer->bo.get(); + return buffer->bo; } namespace { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h index 334478d14..b70517d1c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h @@ -12,7 +12,7 @@ #include "iree/hal/api.h" iree_status_t iree_hal_xrt_lite_buffer_wrap( - std::unique_ptr bo, iree_hal_allocator_t* allocator, + shim_xdna::bo* bo, iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t 
allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index 6a9573cf8..0d3b334f0 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -168,11 +168,10 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( bo_ctrl_code->sync(shim_xdna::direction::host2device); shim_xdna::kernel ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU); - std::unique_ptr context = - command_buffer->shim_device->create_hw_context(kernel_params.pdi, - kernel_params.kernel_name); + shim_xdna::hw_ctx context = command_buffer->shim_device->create_hw_context( + kernel_params.pdi, kernel_params.kernel_name); shim_xdna::cuidx_t cu_idx = - context->open_cu_context(kernel_params.kernel_name); + context.open_cu_context(kernel_params.kernel_name); ebuf.set_cu_idx(cu_idx); unsigned int opcode = 3; @@ -187,7 +186,7 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( bo->sync(shim_xdna::direction::host2device); } - shim_xdna::hw_q* hwq = context->get_hw_queue(); + shim_xdna::hw_q* hwq = context.get_hw_queue(); hwq->issue_command(ebuf.get_exec_buf_bo()); hwq->wait_command(ebuf.get_exec_buf_bo(), 0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index 11f0f04df..c2eec9fd9 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -264,18 +264,6 @@ void bo::import_bo() { void bo::free_bo() { m_drm_bo.reset(); } -bo::bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags) - : bo(p, ctx_id, size, flags, flag_to_type(flags)) { - if (m_type == AMDXDNA_BO_INVALID) - shim_err(EINVAL, "Invalid BO 
flags: 0x%lx", flags); -} - -bo::bo(const pdev &p, uint32_t ctx_id, size_t size, uint32_t flags) - : bo(p, ctx_id, size, shim_xcl_bo_flags{.flags = flags}) { - if (m_type == AMDXDNA_BO_INVALID) - shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); -} - bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, amdxdna_bo_type type) : m_pdev(pdev), @@ -338,6 +326,18 @@ bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); } +bo::bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags) + : bo(p, ctx_id, size, flags, flag_to_type(flags)) { + if (m_type == AMDXDNA_BO_INVALID) + shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); +} + +bo::bo(const pdev &p, uint32_t ctx_id, size_t size, uint32_t flags) + : bo(p, ctx_id, size, shim_xcl_bo_flags{.flags = flags}) { + if (m_type == AMDXDNA_BO_INVALID) + shim_err(EINVAL, "Invalid BO flags: 0x%lx", flags); +} + bo::bo(const pdev &p, int ehdl) : m_pdev(p), m_import(ehdl) { import_bo(); mmap_bo(); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h index 24b57566f..8742c8e28 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -40,6 +40,9 @@ struct drm_bo { uint64_t m_vaddr = AMDXDNA_INVALID_ADDR; drm_bo(bo &parent, const amdxdna_drm_get_bo_info &bo_info); + // no copying + drm_bo(const drm_bo &) = delete; + drm_bo &operator=(const drm_bo &) = delete; ~drm_bo(); }; @@ -73,6 +76,9 @@ struct bo { // Support BO creation from internal bo(const pdev &p, size_t size, amdxdna_bo_type type); ~bo(); + // no copying + bo(const bo &) = delete; + bo &operator=(const bo &) = delete; void *map() const; void unmap(void *addr); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp 
b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 153e11a88..01c4a6bae 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -149,15 +149,15 @@ device::~device() { SHIM_DEBUG("Destroying KMQ device"); } const pdev &device::get_pdev() const { return m_pdev; } -std::unique_ptr device::create_hw_context( - const std::vector &pdi, const std::string &cu_name, - const std::map &qos) { - return std::make_unique(*this, pdi, cu_name, qos); +hw_ctx device::create_hw_context(const std::vector &pdi, + const std::string &cu_name, + const std::map &qos) { + return hw_ctx(*this, pdi, cu_name, qos); } -std::unique_ptr device::create_hw_context( - const std::vector &pdi, const std::string &cu_name) { - return std::make_unique(*this, pdi, cu_name); +hw_ctx device::create_hw_context(const std::vector &pdi, + const std::string &cu_name) { + return hw_ctx(*this, pdi, cu_name); } std::unique_ptr device::alloc_bo(uint32_t ctx_id, size_t size, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h index 1076f72f1..f483960e1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ -44,11 +44,11 @@ struct device { std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); std::unique_ptr import_bo(pid_t, int); - std::unique_ptr create_hw_context( - const std::vector &pdi, const std::string &cu_name, - const std::map &qos); - std::unique_ptr create_hw_context(const std::vector &pdi, - const std::string &cu_name); + hw_ctx create_hw_context(const std::vector &pdi, + const std::string &cu_name, + const std::map &qos); + hw_ctx create_hw_context(const std::vector &pdi, + const std::string &cu_name); std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, uint32_t size); 
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h index 15fc2b481..b989c60ce 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -56,6 +56,9 @@ struct hw_ctx { const std::string &cu_name, const std::map &qos = {}); ~hw_ctx(); + // no copying + hw_ctx(const hw_ctx &) = delete; + hw_ctx &operator=(const hw_ctx &) = delete; std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); std::unique_ptr import_bo(pid_t, int); From dc3d7b92691daeba3d8a94e794bd385499603f98 Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 16 Oct 2024 14:53:12 -0400 Subject: [PATCH 24/35] remove unnecessary sync to device --- .../src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index 0d3b334f0..a5d2c6dbd 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -183,7 +183,6 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); ebuf.add_arg_bo(*bo); - bo->sync(shim_xdna::direction::host2device); } shim_xdna::hw_q* hwq = context.get_hw_queue(); @@ -193,6 +192,8 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( for (iree_host_size_t j = 0; j < bindings.count; ++j) { shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle( iree_hal_buffer_allocated_buffer(bindings.values[j].buffer)); + // TODO(max): this should be happening automatically via a call to some + // buffer API that performs the sync (maybe invalidate_range) 
bo->sync(shim_xdna::direction::device2host); } From 57e0f2c1162e860ff5db86a428f6924529e46fcb Mon Sep 17 00:00:00 2001 From: makslevental Date: Wed, 16 Oct 2024 16:10:07 -0400 Subject: [PATCH 25/35] undo reinterpret_cast --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 18 +++-- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 26 +++---- .../iree-amd-aie/driver/xrt-lite/device.cc | 74 ++++++++++--------- .../driver/xrt-lite/direct_command_buffer.cc | 35 +++++---- .../iree-amd-aie/driver/xrt-lite/driver.cc | 25 ++++--- .../driver/xrt-lite/executable.cc | 27 ++++--- .../iree-amd-aie/driver/xrt-lite/executable.h | 5 +- .../driver/xrt-lite/nop_executable_cache.cc | 15 ++-- .../driver/xrt-lite/nop_semaphore.cc | 6 +- .../src/iree-amd-aie/driver/xrt-lite/util.h | 10 ++- 10 files changed, 138 insertions(+), 103 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 7b4fd00bd..275781e67 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -76,7 +76,9 @@ static iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( IREE_TRACE_ZONE_BEGIN(z0); iree_hal_xrt_lite_allocator* allocator = - reinterpret_cast(base_allocator); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + iree_hal_xrt_lite_allocator); iree_hal_buffer_params_t compat_params = *params; iree_hal_buffer_compatibility_t compatibility = iree_hal_xrt_lite_allocator_query_buffer_compatibility( @@ -118,7 +120,9 @@ static void iree_hal_xrt_lite_allocator_deallocate_buffer( IREE_TRACE_ZONE_BEGIN(z0); iree_hal_xrt_lite_allocator* allocator = - reinterpret_cast(base_allocator); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + iree_hal_xrt_lite_allocator); bool was_imported = false; if (!was_imported) { IREE_STATISTICS(iree_hal_allocator_statistics_record_free( @@ -157,10 
+161,12 @@ iree_status_t iree_hal_xrt_lite_allocator_create( static void iree_hal_xrt_lite_allocator_destroy( iree_hal_allocator_t* base_allocator) { IREE_ASSERT_ARGUMENT(base_allocator); - iree_hal_xrt_lite_allocator* allocator = - reinterpret_cast(base_allocator); IREE_TRACE_ZONE_BEGIN(z0); + iree_hal_xrt_lite_allocator* allocator = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + iree_hal_xrt_lite_allocator); iree_hal_resource_release(&allocator->resource); iree_allocator_free(allocator->host_allocator, allocator); @@ -172,7 +178,9 @@ static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator( IREE_TRACE_ZONE_BEGIN(z0); const iree_hal_xrt_lite_allocator* allocator = - reinterpret_cast(base_allocator); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + const iree_hal_xrt_lite_allocator); IREE_TRACE_ZONE_END(z0); return allocator->host_allocator; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index 40e84daae..6c0dff4f1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -24,8 +24,8 @@ static iree_status_t iree_hal_xrt_lite_buffer_invalidate_range( iree_device_size_t local_byte_length) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_buffer* buffer = - reinterpret_cast(base_buffer); + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); if (IREE_UNLIKELY(!buffer->bo)) { IREE_TRACE_ZONE_END(z0); return iree_make_status( @@ -46,8 +46,8 @@ static iree_status_t iree_hal_xrt_lite_buffer_map_range( iree_hal_buffer_mapping_t* mapping) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_buffer* buffer = - reinterpret_cast(base_buffer); + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, 
iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_buffer_validate_memory_type( iree_hal_buffer_memory_type( @@ -87,8 +87,8 @@ static iree_status_t iree_hal_xrt_lite_buffer_flush_range( iree_device_size_t local_byte_length) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_buffer* buffer = - reinterpret_cast(base_buffer); + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); if (IREE_UNLIKELY(!buffer->bo)) { IREE_TRACE_ZONE_END(z0); return iree_make_status( @@ -123,8 +123,8 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( iree_hal_xrt_lite_buffer* buffer = nullptr; IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer)); + z0, iree_allocator_malloc(host_allocator, sizeof(*buffer), + reinterpret_cast(&buffer))); iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, allocation_size, byte_offset, byte_length, memory_type, allowed_access, allowed_usage, @@ -138,11 +138,11 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( } static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { - iree_hal_xrt_lite_buffer* buffer = - reinterpret_cast(base_buffer); - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); IREE_TRACE_ZONE_BEGIN(z0); + iree_allocator_t host_allocator = base_buffer->host_allocator; if (buffer->release_callback.fn) { buffer->release_callback.fn(buffer->release_callback.user_data, base_buffer); @@ -157,8 +157,8 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { shim_xdna::bo* iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_buffer* buffer = - 
reinterpret_cast(base_buffer); + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); IREE_TRACE_ZONE_END(z0); return buffer->bo; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index da7e793be..6239b10e4 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -23,7 +23,6 @@ extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; struct iree_hal_xrt_lite_device { iree_hal_resource_t resource; - iree_string_view_t identifier; iree_allocator_t host_allocator; // TODO(max): not used because "device allocations" are performed through // device @@ -32,6 +31,9 @@ struct iree_hal_xrt_lite_device { // since command buffers can contain inlined data iree_arena_block_pool_t block_pool; shim_xdna::device* shim_device; + // should come last; see the definition of total_size below in + // iree_hal_xrt_lite_device_create + iree_string_view_t identifier; iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_options* options, iree_allocator_t host_allocator) { @@ -53,12 +55,12 @@ struct iree_hal_xrt_lite_device { }; static iree_status_t iree_hal_xrt_lite_device_create_executable_cache( - iree_hal_device_t* base_value, iree_string_view_t identifier, + iree_hal_device_t* base_device, iree_string_view_t identifier, iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_nop_executable_cache_create( @@ -67,7 +69,7 @@ static iree_status_t iree_hal_xrt_lite_device_create_executable_cache( } static iree_status_t 
iree_hal_xrt_lite_device_create_command_buffer( - iree_hal_device_t* base_value, iree_hal_command_buffer_mode_t mode, + iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, iree_hal_command_buffer_t** out_command_buffer) { @@ -79,8 +81,8 @@ static iree_status_t iree_hal_xrt_lite_device_create_command_buffer( "unimplmented multi-shot command buffer"); } - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); IREE_TRACE_ZONE_END(z0); return iree_hal_deferred_command_buffer_create( @@ -89,12 +91,12 @@ static iree_status_t iree_hal_xrt_lite_device_create_command_buffer( } static iree_status_t iree_hal_xrt_lite_device_create_semaphore( - iree_hal_device_t* base_value, uint64_t initial_value, + iree_hal_device_t* base_device, uint64_t initial_value, iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_semaphore_create(device->host_allocator, @@ -102,7 +104,7 @@ static iree_status_t iree_hal_xrt_lite_device_create_semaphore( } static iree_status_t iree_hal_xrt_lite_device_queue_execute( - iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, const iree_hal_semaphore_list_t wait_semaphore_list, const iree_hal_semaphore_list_t signal_semaphore_list, iree_host_size_t command_buffer_count, @@ -110,8 +112,8 @@ static iree_status_t iree_hal_xrt_lite_device_queue_execute( 
iree_hal_buffer_binding_table_t const* binding_tables) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); for (iree_host_size_t i = 0; i < command_buffer_count; i++) { iree_hal_command_buffer_t* xrt_command_buffer = nullptr; @@ -136,12 +138,12 @@ static iree_status_t iree_hal_xrt_lite_device_queue_execute( } static void iree_hal_xrt_lite_device_replace_device_allocator( - iree_hal_device_t* base_value, iree_hal_allocator_t* new_allocator) { + iree_hal_device_t* base_device, iree_hal_allocator_t* new_allocator) { IREE_TRACE_ZONE_BEGIN(z0); iree_hal_allocator_retain(new_allocator); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); device->device_allocator = new_allocator; iree_hal_allocator_release(device->device_allocator); @@ -149,13 +151,13 @@ static void iree_hal_xrt_lite_device_replace_device_allocator( } static iree_status_t iree_hal_xrt_lite_device_query_i64( - iree_hal_device_t* base_value, iree_string_view_t category, + iree_hal_device_t* base_device, iree_string_view_t category, iree_string_view_t key, int64_t* out_value) { IREE_TRACE_ZONE_BEGIN(z0); *out_value = 0; - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) { *out_value = iree_string_view_match_pattern(device->identifier, key) ? 
1 : 0; @@ -172,7 +174,7 @@ static iree_status_t iree_hal_xrt_lite_device_query_i64( } static iree_status_t iree_hal_xrt_lite_device_queue_alloca( - iree_hal_device_t* base_value, iree_hal_queue_affinity_t queue_affinity, + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, const iree_hal_semaphore_list_t wait_semaphore_list, const iree_hal_semaphore_list_t signal_semaphore_list, iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, @@ -180,8 +182,8 @@ static iree_status_t iree_hal_xrt_lite_device_queue_alloca( iree_hal_buffer_t** IREE_RESTRICT out_buffer) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_semaphore_list_wait(wait_semaphore_list, iree_infinite_timeout())); @@ -196,21 +198,21 @@ static iree_status_t iree_hal_xrt_lite_device_queue_alloca( } static iree_string_view_t iree_hal_xrt_lite_device_id( - iree_hal_device_t* base_value) { + iree_hal_device_t* base_device) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); IREE_TRACE_ZONE_END(z0); return device->identifier; } -static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_value) { +static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); iree_hal_allocator_release(device->device_allocator); delete device->shim_device; @@ -220,22 +222,22 @@ static void 
iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_value) { }; static iree_allocator_t iree_hal_xrt_lite_device_host_allocator( - iree_hal_device_t* base_value) { + iree_hal_device_t* base_device) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); IREE_TRACE_ZONE_END(z0); return device->host_allocator; } static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( - iree_hal_device_t* base_value) { + iree_hal_device_t* base_device) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_device* device = - reinterpret_cast(base_value); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); IREE_TRACE_ZONE_END(z0); return device->device_allocator; @@ -262,8 +264,8 @@ iree_status_t iree_hal_xrt_lite_device_create( iree_hal_xrt_lite_device* device = nullptr; iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size; - IREE_RETURN_IF_ERROR( - iree_allocator_malloc(host_allocator, total_size, (void**)&device)); + IREE_RETURN_IF_ERROR(iree_allocator_malloc( + host_allocator, total_size, reinterpret_cast(&device))); device = new (device) iree_hal_xrt_lite_device(options, host_allocator); iree_string_view_append_to_buffer( identifier, &device->identifier, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index a5d2c6dbd..f5a79dc10 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -10,11 +10,8 @@ #include "iree-amd-aie/driver/xrt-lite/executable.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h" #include 
"iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" #include "iree/hal/utils/resource_set.h" -#include "util.h" - -#define IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_BINDING_COUNT 16 -#define IREE_HAL_XRT_LITE_MAX_DESCRIPTOR_SET_COUNT 4 struct iree_hal_xrt_lite_direct_command_buffer { iree_hal_command_buffer_t base; @@ -58,10 +55,11 @@ iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( sizeof(*command_buffer) + iree_hal_command_buffer_validation_state_size( mode, binding_capacity), - (void**)&command_buffer)); + reinterpret_cast(&command_buffer))); iree_hal_command_buffer_initialize( device_allocator, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY, - binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer), + binding_capacity, + reinterpret_cast(command_buffer) + sizeof(*command_buffer), &iree_hal_xrt_lite_direct_command_buffer_vtable, &command_buffer->base); command_buffer->host_allocator = host_allocator; command_buffer->shim_device = shim_device; @@ -84,8 +82,9 @@ static void iree_hal_xrt_lite_direct_command_buffer_destroy( IREE_TRACE_ZONE_BEGIN(z0); iree_hal_xrt_lite_direct_command_buffer* command_buffer = - reinterpret_cast( - base_command_buffer); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_command_buffer, iree_hal_xrt_lite_direct_command_buffer_vtable, + iree_hal_xrt_lite_direct_command_buffer); iree_allocator_t host_allocator = command_buffer->host_allocator; iree_hal_resource_set_free(command_buffer->resource_set); iree_arena_deinitialize(&command_buffer->arena); @@ -99,13 +98,14 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_update_buffer( iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) { IREE_TRACE_ZONE_BEGIN(z0); - const uint8_t* src = (const uint8_t*)source_buffer + source_offset; + const uint8_t* src = + reinterpret_cast(source_buffer) + source_offset; // No need to Allocate scratch space (in an arena) as the memcpy // used below is 
expected to be synchronized. shim_xdna::bo* target_device_buffer = iree_hal_xrt_lite_buffer_handle( iree_hal_buffer_allocated_buffer(target_ref.buffer)); void* target_device_buffer_ptr = target_device_buffer->map(); - uint8_t* dst = (uint8_t*)target_device_buffer_ptr + + uint8_t* dst = reinterpret_cast(target_device_buffer_ptr) + iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset; memcpy(dst, src, target_ref.length); @@ -131,8 +131,10 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_copy_buffer( iree_device_size_t source_offset = iree_hal_buffer_byte_offset(source_ref.buffer) + source_ref.offset; - uint8_t* dst = (uint8_t*)target_device_buffer_ptr + target_offset; - uint8_t* src = (uint8_t*)source_device_buffer_ptr + source_offset; + uint8_t* dst = + reinterpret_cast(target_device_buffer_ptr) + target_offset; + uint8_t* src = + reinterpret_cast(source_device_buffer_ptr) + source_offset; memcpy(dst, src, target_ref.length); IREE_TRACE_ZONE_END(z0); @@ -147,12 +149,13 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( IREE_TRACE_ZONE_BEGIN(z0); iree_hal_xrt_lite_direct_command_buffer* command_buffer = - reinterpret_cast( - base_command_buffer); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_command_buffer, iree_hal_xrt_lite_direct_command_buffer_vtable, + iree_hal_xrt_lite_direct_command_buffer); // Lookup kernel parameters used for side-channeling additional launch // information from the compiler. 
- iree_hal_xrt_lite_native_executable* executable = - reinterpret_cast(base_executable); + iree_hal_xrt_lite_executable* executable = + iree_hal_xrt_lite_executable_cast(base_executable); iree_hal_xrt_lite_kernel_params kernel_params = executable->entry_points[entry_point]; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 738dda0f3..eda721bb1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -5,16 +5,16 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/driver/xrt-lite/api.h" -#include "util.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" #define IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT 0 struct iree_hal_xrt_lite_driver { iree_hal_resource_t resource; iree_allocator_t host_allocator; - iree_string_view_t identifier; iree_hal_xrt_lite_driver_options options; // + trailing identifier string storage + iree_string_view_t identifier; }; namespace { @@ -44,23 +44,24 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( iree_hal_xrt_lite_driver* driver = nullptr; iree_host_size_t total_size = sizeof(*driver) + identifier.size; IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_allocator_malloc(host_allocator, total_size, (void**)&driver)); + z0, iree_allocator_malloc(host_allocator, total_size, + reinterpret_cast(&driver))); iree_hal_resource_initialize(&iree_hal_xrt_lite_driver_vtable, &driver->resource); driver->host_allocator = host_allocator; iree_string_view_append_to_buffer( identifier, &driver->identifier, - (char*)driver + total_size - identifier.size); + reinterpret_cast(driver) + total_size - identifier.size); memcpy(&driver->options, options, sizeof(*options)); - *out_driver = (iree_hal_driver_t*)driver; + *out_driver = reinterpret_cast(driver); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { - 
iree_hal_xrt_lite_driver* driver = - reinterpret_cast(base_driver); + iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver); iree_allocator_t host_allocator = driver->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); @@ -87,7 +88,7 @@ static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( return iree_allocator_clone( host_allocator, iree_make_const_byte_span(device_infos, sizeof(device_infos)), - (void**)out_device_infos); + reinterpret_cast(out_device_infos)); } static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( @@ -96,8 +97,8 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( iree_allocator_t host_allocator, iree_hal_device_t** out_device) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_driver* driver = - reinterpret_cast(base_driver); + iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver); iree_hal_xrt_lite_device_options options = driver->options.default_device_options; @@ -113,8 +114,8 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path( iree_hal_device_t** out_device) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_driver* driver = - reinterpret_cast(base_driver); + iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver); iree_hal_xrt_lite_device_options options = driver->options.default_device_options; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index b2fba7557..7e846f5ab 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -9,16 +9,22 @@ #include #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" -#include 
"iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" #include "iree-amd-aie/schemas/pdi_executable_def_reader.h" #include "iree-amd-aie/schemas/pdi_executable_def_verifier.h" #include "iree/base/api.h" namespace { -extern const iree_hal_executable_vtable_t - iree_hal_xrt_lite_native_executable_vtable; +extern const iree_hal_executable_vtable_t iree_hal_xrt_lite_executable_vtable; } // namespace +iree_hal_xrt_lite_executable* iree_hal_xrt_lite_executable_cast( + iree_hal_executable_t* base_executable) { + return IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_executable, iree_hal_xrt_lite_executable_vtable, + iree_hal_xrt_lite_executable); +} + static iree_status_t iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( iree_const_byte_span_t flatbuffer_data) { @@ -94,7 +100,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( IREE_TRACE_ZONE_BEGIN(z0); *out_executable = nullptr; - iree_hal_xrt_lite_native_executable* executable = nullptr; + iree_hal_xrt_lite_executable* executable = nullptr; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( @@ -138,7 +144,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( reinterpret_cast(executable) + sizeof(*executable) + entry_point_count * sizeof(executable->entry_points[0]))); - iree_hal_resource_initialize(&iree_hal_xrt_lite_native_executable_vtable, + iree_hal_resource_initialize(&iree_hal_xrt_lite_executable_vtable, &executable->resource); executable->host_allocator = host_allocator; executable->entry_point_count = entry_point_count; @@ -205,8 +211,10 @@ static void iree_hal_xrt_lite_native_executable_destroy( iree_hal_executable_t* base_executable) { IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_xrt_lite_native_executable* executable = - reinterpret_cast(base_executable); + iree_hal_xrt_lite_executable* executable = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_executable, + 
iree_hal_xrt_lite_executable_vtable, + iree_hal_xrt_lite_executable); iree_allocator_t host_allocator = executable->host_allocator; iree_allocator_free(host_allocator, executable); @@ -214,8 +222,7 @@ static void iree_hal_xrt_lite_native_executable_destroy( } namespace { -const iree_hal_executable_vtable_t iree_hal_xrt_lite_native_executable_vtable = - { - .destroy = iree_hal_xrt_lite_native_executable_destroy, +const iree_hal_executable_vtable_t iree_hal_xrt_lite_executable_vtable = { + .destroy = iree_hal_xrt_lite_native_executable_destroy, }; } // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index ac88f2192..1b6d7a58b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -25,7 +25,7 @@ struct iree_hal_xrt_lite_kernel_params { IREE_TRACE(uint32_t source_line;) }; -struct iree_hal_xrt_lite_native_executable { +struct iree_hal_xrt_lite_executable { // Abstract resource used for injecting reference counting and vtable; must be // at offset 0. 
iree_hal_resource_t resource; @@ -41,4 +41,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); +iree_hal_xrt_lite_executable* iree_hal_xrt_lite_executable_cast( + iree_hal_executable_t* base_executable); + #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc index 62b3ebfa4..20262e955 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -6,10 +6,9 @@ #include "iree-amd-aie/driver/xrt-lite/nop_executable_cache.h" -#include - #include "iree-amd-aie/driver/xrt-lite/executable.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" #include "iree/base/api.h" #include "iree/base/tracing.h" @@ -48,7 +47,7 @@ iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( iree_hal_xrt_lite_nop_executable_cache* executable_cache = nullptr; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, sizeof(*executable_cache), - (void**)&executable_cache)); + reinterpret_cast(&executable_cache))); executable_cache = new (executable_cache) iree_hal_xrt_lite_nop_executable_cache(shim_device, host_allocator); *out_executable_cache = @@ -63,8 +62,9 @@ static void iree_hal_xrt_lite_nop_executable_cache_destroy( IREE_TRACE_ZONE_BEGIN(z0); iree_hal_xrt_lite_nop_executable_cache* executable_cache = - reinterpret_cast( - base_executable_cache); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_executable_cache, iree_hal_xrt_lite_nop_executable_cache_vtable, + iree_hal_xrt_lite_nop_executable_cache); iree_allocator_free(executable_cache->host_allocator, executable_cache); IREE_TRACE_ZONE_END(z0); @@ -85,8 +85,9 @@ static 
iree_status_t iree_hal_xrt_lite_nop_executable_cache_prepare_executable( IREE_TRACE_ZONE_BEGIN(z0); iree_hal_xrt_lite_nop_executable_cache* executable_cache = - reinterpret_cast( - base_executable_cache); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_executable_cache, iree_hal_xrt_lite_nop_executable_cache_vtable, + iree_hal_xrt_lite_nop_executable_cache); IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_native_executable_create( diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc index c3e285b82..e6a29a17f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc @@ -36,7 +36,7 @@ iree_status_t iree_hal_xrt_lite_semaphore_create( iree_hal_xrt_lite_semaphore* semaphore = nullptr; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, sizeof(*semaphore), - (void**)&semaphore)); + reinterpret_cast(&semaphore))); semaphore = new (semaphore) iree_hal_xrt_lite_semaphore(initial_value, host_allocator); *out_semaphore = &semaphore->base; @@ -50,7 +50,9 @@ static void iree_hal_xrt_lite_semaphore_destroy( IREE_TRACE_ZONE_BEGIN(z0); iree_hal_xrt_lite_semaphore* semaphore = - reinterpret_cast(base_semaphore); + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_semaphore, + iree_hal_xrt_lite_semaphore_vtable, + iree_hal_xrt_lite_semaphore); iree_allocator_t host_allocator = semaphore->host_allocator; iree_hal_semaphore_deinitialize(&semaphore->base); iree_allocator_free(host_allocator, semaphore); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h index ff3de896c..c50e4a235 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h @@ -20,6 +20,14 @@ iree_status_t unimplemented_ok_status(Params...) { } template -void unimplemented_ok_void(Params...) 
{} +void unimplemented_ok_void(Params...){} +#ifndef NDEBUG +#define IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_value, vtable, subvalue_t) \ + (IREE_HAL_ASSERT_TYPE(base_value, &vtable), \ + reinterpret_cast(base_value)) +#else +#define IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_value, vtable, subvalue_t) \ + (reinterpret_cast(base_value)) +#endif #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H From e27d2c063e25048172dbf4582328a788356aca9f Mon Sep 17 00:00:00 2001 From: makslevental Date: Thu, 17 Oct 2024 13:16:34 -0400 Subject: [PATCH 26/35] remove exceptions --- .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 3 +-- .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 16 ++++------------ .../driver/xrt-lite/shim/linux/kmq/device.cpp | 16 +++++++--------- .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 6 +----- .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 6 +----- .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 13 ++++++++----- .../driver/xrt-lite/shim/linux/kmq/kernel.cpp | 7 +++---- .../driver/xrt-lite/shim/linux/kmq/shim_debug.h | 13 +++++-------- 8 files changed, 30 insertions(+), 50 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt index ca6cec933..e0757905f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -27,8 +27,7 @@ iree_cc_library( shim_debug.h DEPS uuid - COPTS - $<$:-fexceptions -frtti> + LLVMSupport DEFINES $<$:SHIM_XDNA_DEBUG> PUBLIC diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp index c2eec9fd9..2d266f729 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -181,11 +181,7 @@ drm_bo::drm_bo(bo &parent, const amdxdna_drm_get_bo_info &bo_info) 
drm_bo::~drm_bo() { if (m_handle == AMDXDNA_INVALID_BO_HANDLE) return; - try { - free_drm_bo(m_parent.m_pdev, m_handle); - } catch (const std::system_error &e) { - SHIM_DEBUG("Failed to free DRM BO: %s", e.what()); - } + free_drm_bo(m_parent.m_pdev, m_handle); } std::string bo::type_to_name() const { @@ -351,13 +347,9 @@ bo::~bo() { SHIM_DEBUG("Freeing KMQ BO, %s", describe().c_str()); munmap_bo(); - try { - detach_from_ctx(); - // If BO is in use, we should block and wait in driver - free_bo(); - } catch (const std::system_error &e) { - SHIM_DEBUG("Failed to free BO: %s", e.what()); - } + detach_from_ctx(); + // If BO is in use, we should block and wait in driver + free_bo(); } bo::bo(const pdev &p, size_t size, amdxdna_bo_type type) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 01c4a6bae..ae87fc0b3 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -16,6 +16,7 @@ #include "bo.h" #include "fence.h" #include "hwctx.h" +#include "llvm/Support/ErrorHandling.h" #include "shim_debug.h" #include "xrt_mem.h" @@ -26,29 +27,26 @@ int64_t import_fd(pid_t pid, int ehdl) { #if defined(SYS_pidfd_open) && defined(SYS_pidfd_getfd) auto pidfd = syscall(SYS_pidfd_open, pid, 0); - if (pidfd < 0) - throw std::system_error(errno, std::system_category(), "pidfd_open failed"); + if (pidfd < 0) shim_xdna::shim_err(errno, "pidfd_open failed"); int64_t fd = syscall(SYS_pidfd_getfd, pidfd, ehdl, 0); if (fd < 0) { if (errno == EPERM) { - throw std::system_error( - errno, std::system_category(), + shim_xdna::shim_err( + errno, "pidfd_getfd failed, check that ptrace access mode " "allows PTRACE_MODE_ATTACH_REALCREDS. 
For more details please " "check /etc/sysctl.d/10-ptrace.conf"); } - throw std::system_error(errno, std::system_category(), - "pidfd_getfd failed"); + shim_xdna::shim_err(errno, "pidfd_getfd failed"); } return fd; #else - throw std::system_error( - int(std::errc::not_supported), std::system_category(), + shim_xdna::shim_err( + int(std::errc::not_supported), "Importing buffer object from different process requires XRT " " built and installed on a system with 'pidfd' kernel support"); - return -1; #endif } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp index ba48b3f9e..06fd948fc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp @@ -158,11 +158,7 @@ fence_handle::fence_handle(const fence_handle &f) fence_handle::~fence_handle() { SHIM_DEBUG("Fence going away: %d@%ld", m_syncobj_hdl, m_state); - try { - destroy_syncobj(m_pdev, m_syncobj_hdl); - } catch (const std::system_error &e) { - SHIM_DEBUG("Failed to destroy fence_handle"); - } + destroy_syncobj(m_pdev, m_syncobj_hdl); } std::unique_ptr fence_handle::share_handle() const { diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index c7fbc0e7e..8784c2cc8 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -82,11 +82,7 @@ hw_ctx::hw_ctx(device &device, const std::vector &pdi, } hw_ctx::~hw_ctx() { - try { - delete_ctx_on_device(); - } catch (const std::system_error &e) { - SHIM_DEBUG("Failed to delete context on device: %s", e.what()); - } + delete_ctx_on_device(); SHIM_DEBUG("Destroyed HW context (%d)...", m_handle); SHIM_DEBUG("Destroying KMQ HW context (%d)...", m_handle); } diff --git 
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp index 22a29549b..b3bcc6b2b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -3,6 +3,8 @@ #include "hwq.h" +#include + #include "bo.h" #include "ert.h" #include "fence.h" @@ -28,11 +30,12 @@ int wait_cmd(const shim_xdna::pdev &pdev, const shim_xdna::hw_ctx *ctx, .seq = id, }; - try { - pdev.ioctl(DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd); - } catch (const std::system_error &ex) { - if (ex.code().value() != ETIME) throw; - ret = 0; + if (::ioctl(pdev.m_dev_fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd) == -1) { + if (errno == ETIME) { + ret = 0; + } else { + shim_xdna::shim_err(errno, "DRM_IOCTL_AMDXDNA_WAIT_CMD IOCTL failed"); + } } return ret; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp index b86da244a..f20b32dc5 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp @@ -10,6 +10,7 @@ #include "amdxdna_accel.h" #include "bo.h" #include "device.h" +#include "shim_debug.h" #define MAX_EXEC_BO_SIZE 4096 @@ -64,8 +65,7 @@ void kernel::add_ctrl_bo(bo &bo_ctrl) { break; } default: - throw std::runtime_error("Unknown exec buf op code: " + - std::to_string(m_op)); + shim_err(-1, "Unknown exec buf op code: %d", m_op); } } @@ -116,8 +116,7 @@ void kernel::inc_pkt_count(uint32_t n) const { m_cmd_pkt->count += n / sizeof(int32_t); if (m_cmd_size < sizeof(m_cmd_pkt->header) + m_cmd_pkt->count * sizeof(int32_t)) - throw std::runtime_error("Size of exec buf too small: " + - std::to_string(m_cmd_size)); + shim_err(-1, "Size of exec buf too small: %d", m_cmd_size); } bo *kernel::get_exec_buf_bo() const { return m_exec_buf_bo.get(); } diff --git 
a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h index bf853312e..f9e5e1785 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h @@ -10,6 +10,8 @@ #include #include +#include "llvm/Support/ErrorHandling.h" + void debugf(const char *format, ...); namespace shim_xdna { @@ -19,18 +21,13 @@ template std::string format = std::string(fmt); format += " (err=%d)"; int sz = std::snprintf(nullptr, 0, format.c_str(), args..., err) + 1; - if (sz <= 0) - throw std::system_error(sz, std::system_category(), - "could not format error string"); + if (sz <= 0) llvm::report_fatal_error("could not format error string"); auto size = static_cast(sz); std::unique_ptr buf(new char[size]); std::snprintf(buf.get(), size, format.c_str(), args..., err); - throw std::system_error(err, std::system_category(), std::string(buf.get())); -} - -[[noreturn]] inline void shim_not_supported_err(const char *msg) { - shim_err(ENOTSUP, msg); + std::string err_str(buf.get()); + llvm::report_fatal_error(err_str.c_str()); } template From 8bed954af9c386c3eda0f338bfaa9510cd5a8e48 Mon Sep 17 00:00:00 2001 From: makslevental Date: Thu, 17 Oct 2024 18:01:42 -0400 Subject: [PATCH 27/35] parameterize n_rows, n_cols --- .github/workflows/ci-linux.yml | 16 +- .../ci/amdxdna_driver_utils/amdxdna_accel.py | 789 ++++++++++++++++++ .../ci/amdxdna_driver_utils/amdxdna_ioctl.py | 217 +++++ build_tools/ci/cpu_comparison/run.py | 23 +- build_tools/ci/run_matmul_test.sh | 7 + .../src/iree-amd-aie/driver/xrt-lite/api.h | 12 +- .../iree-amd-aie/driver/xrt-lite/device.cc | 57 +- .../src/iree-amd-aie/driver/xrt-lite/device.h | 33 + .../driver/xrt-lite/direct_command_buffer.cc | 23 +- .../driver/xrt-lite/direct_command_buffer.h | 4 +- .../iree-amd-aie/driver/xrt-lite/driver.cc | 11 +- .../xrt-lite/registration/driver_module.c | 106 ++- 
.../driver/xrt-lite/shim/linux/kmq/device.cpp | 9 +- .../driver/xrt-lite/shim/linux/kmq/device.h | 6 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 30 +- .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 6 +- tests/conftest.py | 4 + 17 files changed, 1261 insertions(+), 92 deletions(-) create mode 100644 build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py create mode 100644 build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/device.h diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 44a3eb03b..9e3e25508 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -160,6 +160,16 @@ jobs: source .venv/bin/activate pip install -r tests/requirements.txt + - name: Query device info + run: | + source .venv/bin/activate + echo "aie-metadata" + python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-metadata + echo "aie-version" + python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-version + echo "XRT_LITE_N_CORE_ROWS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-rows)" >> $GITHUB_ENV + echo "XRT_LITE_N_CORE_COLS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-cols)" >> $GITHUB_ENV + - name : E2E comparison of AIE to llvm-cpu run: | source .venv/bin/activate @@ -168,7 +178,9 @@ jobs: $PWD/iree-install \ $PWD/llvm-aie \ --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v + --reset-npu-between-runs -v \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS - name: E2E correctness matmul test run: | @@ -195,5 +207,5 @@ jobs: run: | DEVICE_TEST_DIR="$PWD/iree-install/device_tests" for t in $(ls $DEVICE_TEST_DIR); do - $DEVICE_TEST_DIR/$t + $DEVICE_TEST_DIR/$t --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS done diff --git a/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py 
b/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py new file mode 100644 index 000000000..edcd5f260 --- /dev/null +++ b/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py @@ -0,0 +1,789 @@ +# generated using clang2py amdxdna_accel.h -o amdxdna_accel.py -k cdefstum +import ctypes + + +class AsDictMixin: + @classmethod + def as_dict(cls, self): + result = {} + if not isinstance(self, AsDictMixin): + # not a structure, assume it's already a python object + return self + if not hasattr(cls, "_fields_"): + return result + for field_tuple in cls._fields_: # noqa + field = field_tuple[0] + if field.startswith("PADDING_"): + continue + value = getattr(self, field) + type_ = type(value) + if hasattr(value, "_length_") and hasattr(value, "_type_"): + # array + if not hasattr(type_, "as_dict"): + value = [v for v in value] + else: + type_ = type_._type_ + value = [type_.as_dict(v) for v in value] + elif hasattr(value, "contents") and hasattr(value, "_type_"): + # pointer + try: + if not hasattr(type_, "as_dict"): + value = value.contents + else: + type_ = type_._type_ + value = type_.as_dict(value.contents) + except ValueError: + # nullptr + value = None + elif isinstance(value, AsDictMixin): + # other structure + value = type_.as_dict(value) + result[field] = value + return result + + +class Structure(ctypes.Structure, AsDictMixin): + def __init__(self, *args, **kwds): + # We don't want to use positional arguments fill PADDING_* fields + + args = dict(zip(self.__class__._field_names_(), args)) + args.update(kwds) + super(Structure, self).__init__(**args) + + @classmethod + def _field_names_(cls): + if hasattr(cls, "_fields_"): + return (f[0] for f in cls._fields_ if not f[0].startswith("PADDING")) + else: + return () + + @classmethod + def get_type(cls, field): + for f in cls._fields_: + if f[0] == field: + return f[1] + return None + + @classmethod + def bind(cls, bound_fields): + fields = {} + for name, type_ in cls._fields_: + if hasattr(type_, "restype"): + if name 
in bound_fields: + if bound_fields[name] is None: + fields[name] = type_() + else: + # use a closure to capture the callback from the loop scope + fields[name] = type_( + (lambda callback: lambda *args: callback(*args))( + bound_fields[name] + ) + ) + del bound_fields[name] + else: + # default callback implementation (does nothing) + try: + default_ = type_(0).restype().value + except TypeError: + default_ = None + fields[name] = type_( + (lambda default_: lambda *args: default_)(default_) + ) + else: + # not a callback function, use default initialization + if name in bound_fields: + fields[name] = bound_fields[name] + del bound_fields[name] + else: + fields[name] = type_() + if len(bound_fields) != 0: + raise ValueError( + "Cannot bind the following unknown callback(s) {}.{}".format( + cls.__name__, bound_fields.keys() + ) + ) + return cls(**fields) + + +class Union(ctypes.Union, AsDictMixin): + pass + + +AMDXDNA_ACCEL_H_ = True # macro +AMDXDNA_DRIVER_MAJOR = 1 # macro +AMDXDNA_DRIVER_MINOR = 0 # macro +AMDXDNA_INVALID_CMD_HANDLE = ~0 # macro +AMDXDNA_INVALID_ADDR = ~0 # macro +AMDXDNA_INVALID_CTX_HANDLE = 0 # macro +AMDXDNA_INVALID_BO_HANDLE = 0 # macro +AMDXDNA_INVALID_FENCE_HANDLE = 0 # macro +SYNC_DIRECT_TO_DEVICE = 0 # macro +SYNC_DIRECT_FROM_DEVICE = 1 # macro + +# values for enumeration 'amdxdna_drm_ioctl_id' +amdxdna_drm_ioctl_id__enumvalues = { + 0: "DRM_AMDXDNA_CREATE_HWCTX", + 1: "DRM_AMDXDNA_DESTROY_HWCTX", + 2: "DRM_AMDXDNA_CONFIG_HWCTX", + 3: "DRM_AMDXDNA_CREATE_BO", + 4: "DRM_AMDXDNA_GET_BO_INFO", + 5: "DRM_AMDXDNA_SYNC_BO", + 6: "DRM_AMDXDNA_EXEC_CMD", + 7: "DRM_AMDXDNA_WAIT_CMD", + 8: "DRM_AMDXDNA_GET_INFO", + 9: "DRM_AMDXDNA_SET_STATE", + 10: "DRM_AMDXDNA_SUBMIT_WAIT", + 11: "DRM_AMDXDNA_SUBMIT_SIGNAL", + 12: "DRM_AMDXDNA_NUM_IOCTLS", +} +DRM_AMDXDNA_CREATE_HWCTX = 0 +DRM_AMDXDNA_DESTROY_HWCTX = 1 +DRM_AMDXDNA_CONFIG_HWCTX = 2 +DRM_AMDXDNA_CREATE_BO = 3 +DRM_AMDXDNA_GET_BO_INFO = 4 +DRM_AMDXDNA_SYNC_BO = 5 +DRM_AMDXDNA_EXEC_CMD = 6 
+DRM_AMDXDNA_WAIT_CMD = 7 +DRM_AMDXDNA_GET_INFO = 8 +DRM_AMDXDNA_SET_STATE = 9 +DRM_AMDXDNA_SUBMIT_WAIT = 10 +DRM_AMDXDNA_SUBMIT_SIGNAL = 11 +DRM_AMDXDNA_NUM_IOCTLS = 12 +amdxdna_drm_ioctl_id = ctypes.c_uint32 # enum + +# values for enumeration 'amdxdna_device_type' +amdxdna_device_type__enumvalues = { + -1: "AMDXDNA_DEV_TYPE_UNKNOWN", + 0: "AMDXDNA_DEV_TYPE_KMQ", + 1: "AMDXDNA_DEV_TYPE_UMQ", +} +AMDXDNA_DEV_TYPE_UNKNOWN = -1 +AMDXDNA_DEV_TYPE_KMQ = 0 +AMDXDNA_DEV_TYPE_UMQ = 1 +amdxdna_device_type = ctypes.c_int32 # enum + + +class struct_amdxdna_qos_info(Structure): + pass + + +struct_amdxdna_qos_info._pack_ = 1 # source:False +struct_amdxdna_qos_info._fields_ = [ + ("gops", ctypes.c_uint32), + ("fps", ctypes.c_uint32), + ("dma_bandwidth", ctypes.c_uint32), + ("latency", ctypes.c_uint32), + ("frame_exec_time", ctypes.c_uint32), + ("priority", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_create_hwctx(Structure): + pass + + +struct_amdxdna_drm_create_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_create_hwctx._fields_ = [ + ("ext", ctypes.c_uint64), + ("ext_flags", ctypes.c_uint64), + ("qos_p", ctypes.c_uint64), + ("umq_bo", ctypes.c_uint32), + ("log_buf_bo", ctypes.c_uint32), + ("max_opc", ctypes.c_uint32), + ("num_tiles", ctypes.c_uint32), + ("mem_size", ctypes.c_uint32), + ("umq_doorbell", ctypes.c_uint32), + ("handle", ctypes.c_uint32), + ("PADDING_0", ctypes.c_ubyte * 4), +] + + +# DRM_IOCTL_AMDXDNA_CREATE_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX , struct_amdxdna_drm_create_hwctx ) # macro +class struct_amdxdna_drm_destroy_hwctx(Structure): + pass + + +struct_amdxdna_drm_destroy_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_destroy_hwctx._fields_ = [ + ("handle", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + + +# DRM_IOCTL_AMDXDNA_DESTROY_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX , struct_amdxdna_drm_destroy_hwctx ) # macro +class struct_amdxdna_cu_config(Structure): + pass + + 
+struct_amdxdna_cu_config._pack_ = 1 # source:False +struct_amdxdna_cu_config._fields_ = [ + ("cu_bo", ctypes.c_uint32), + ("cu_func", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 3), +] + + +def struct_amdxdna_hwctx_param_config_cu(num_cus, cu_configs): + assert len(cu_configs) == num_cus + + class struct_amdxdna_hwctx_param_config_cu(Structure): + pass + + struct_amdxdna_hwctx_param_config_cu._pack_ = 1 # source:False + struct_amdxdna_hwctx_param_config_cu._fields_ = [ + ("num_cus", ctypes.c_uint16), + ("pad", ctypes.c_uint16 * 3), + ("cu_configs", struct_amdxdna_cu_config * num_cus), + ] + struc = struct_amdxdna_hwctx_param_config_cu() + struc.num_cus = num_cus + struc.cu_configs = (struct_amdxdna_cu_config * num_cus)(*cu_configs) + return struc + + +# values for enumeration 'amdxdna_drm_config_hwctx_param' +amdxdna_drm_config_hwctx_param__enumvalues = { + 0: "DRM_AMDXDNA_HWCTX_CONFIG_CU", + 1: "DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF", + 2: "DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF", + 3: "DRM_AMDXDNA_HWCTX_CONFIG_NUM", +} +DRM_AMDXDNA_HWCTX_CONFIG_CU = 0 +DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF = 1 +DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF = 2 +DRM_AMDXDNA_HWCTX_CONFIG_NUM = 3 +amdxdna_drm_config_hwctx_param = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_config_hwctx(Structure): + pass + + +struct_amdxdna_drm_config_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_config_hwctx._fields_ = [ + ("handle", ctypes.c_uint32), + ("param_type", ctypes.c_uint32), + ("param_val", ctypes.c_uint64), + ("param_val_size", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + +# DRM_IOCTL_AMDXDNA_CONFIG_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX , struct_amdxdna_drm_config_hwctx ) # macro + +# values for enumeration 'amdxdna_bo_type' +amdxdna_bo_type__enumvalues = { + 0: "AMDXDNA_BO_INVALID", + 1: "AMDXDNA_BO_SHMEM", + 2: "AMDXDNA_BO_DEV_HEAP", + 3: "AMDXDNA_BO_DEV", + 4: "AMDXDNA_BO_CMD", + 5: "AMDXDNA_BO_DMA", +} +AMDXDNA_BO_INVALID = 0 +AMDXDNA_BO_SHMEM = 1 
+AMDXDNA_BO_DEV_HEAP = 2 +AMDXDNA_BO_DEV = 3 +AMDXDNA_BO_CMD = 4 +AMDXDNA_BO_DMA = 5 +amdxdna_bo_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_create_bo(Structure): + pass + + +struct_amdxdna_drm_create_bo._pack_ = 1 # source:False +struct_amdxdna_drm_create_bo._fields_ = [ + ("flags", ctypes.c_uint64), + ("type", ctypes.c_uint32), + ("_pad", ctypes.c_uint32), + ("vaddr", ctypes.c_uint64), + ("size", ctypes.c_uint64), + ("handle", ctypes.c_uint32), + ("PADDING_0", ctypes.c_ubyte * 4), +] + + +# DRM_IOCTL_AMDXDNA_CREATE_BO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO , struct_amdxdna_drm_create_bo ) # macro +class struct_amdxdna_drm_get_bo_info(Structure): + pass + + +struct_amdxdna_drm_get_bo_info._pack_ = 1 # source:False +struct_amdxdna_drm_get_bo_info._fields_ = [ + ("ext", ctypes.c_uint64), + ("ext_flags", ctypes.c_uint64), + ("handle", ctypes.c_uint32), + ("_pad", ctypes.c_uint32), + ("map_offset", ctypes.c_uint64), + ("vaddr", ctypes.c_uint64), + ("xdna_addr", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_GET_BO_INFO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO , struct_amdxdna_drm_get_bo_info ) # macro +class struct_amdxdna_drm_sync_bo(Structure): + pass + + +struct_amdxdna_drm_sync_bo._pack_ = 1 # source:False +struct_amdxdna_drm_sync_bo._fields_ = [ + ("handle", ctypes.c_uint32), + ("direction", ctypes.c_uint32), + ("offset", ctypes.c_uint64), + ("size", ctypes.c_uint64), +] + +# DRM_IOCTL_AMDXDNA_SYNC_BO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO , struct_amdxdna_drm_sync_bo ) # macro + +# values for enumeration 'amdxdna_cmd_type' +amdxdna_cmd_type__enumvalues = { + 0: "AMDXDNA_CMD_SUBMIT_EXEC_BUF", + 1: "AMDXDNA_CMD_SUBMIT_DEPENDENCY", + 2: "AMDXDNA_CMD_SUBMIT_SIGNAL", +} +AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0 +AMDXDNA_CMD_SUBMIT_DEPENDENCY = 1 +AMDXDNA_CMD_SUBMIT_SIGNAL = 2 +amdxdna_cmd_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_exec_cmd(Structure): + pass + + +struct_amdxdna_drm_exec_cmd._pack_ = 1 
# source:False +struct_amdxdna_drm_exec_cmd._fields_ = [ + ("ext", ctypes.c_uint64), + ("ext_flags", ctypes.c_uint64), + ("hwctx", ctypes.c_uint32), + ("type", ctypes.c_uint32), + ("cmd_handles", ctypes.c_uint64), + ("args", ctypes.c_uint64), + ("cmd_count", ctypes.c_uint32), + ("arg_count", ctypes.c_uint32), + ("seq", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_EXEC_CMD = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD , struct_amdxdna_drm_exec_cmd ) # macro +class struct_amdxdna_drm_wait_cmd(Structure): + pass + + +struct_amdxdna_drm_wait_cmd._pack_ = 1 # source:False +struct_amdxdna_drm_wait_cmd._fields_ = [ + ("hwctx", ctypes.c_uint32), + ("timeout", ctypes.c_uint32), + ("seq", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_WAIT_CMD = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD , struct_amdxdna_drm_wait_cmd ) # macro +class struct_amdxdna_drm_query_aie_status(Structure): + pass + + +struct_amdxdna_drm_query_aie_status._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_status._fields_ = [ + ("buffer", ctypes.c_uint64), + ("buffer_size", ctypes.c_uint32), + ("cols_filled", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_query_aie_version(Structure): + pass + + +struct_amdxdna_drm_query_aie_version._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_version._fields_ = [ + ("major", ctypes.c_uint32), + ("minor", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_query_aie_tile_metadata(Structure): + pass + + +struct_amdxdna_drm_query_aie_tile_metadata._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_tile_metadata._fields_ = [ + ("row_count", ctypes.c_uint16), + ("row_start", ctypes.c_uint16), + ("dma_channel_count", ctypes.c_uint16), + ("lock_count", ctypes.c_uint16), + ("event_reg_count", ctypes.c_uint16), + ("pad", ctypes.c_uint16 * 3), +] + + +class struct_amdxdna_drm_query_aie_metadata(Structure): + pass + + +struct_amdxdna_drm_query_aie_metadata._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_metadata._fields_ = [ + 
("col_size", ctypes.c_uint32), + ("cols", ctypes.c_uint16), + ("rows", ctypes.c_uint16), + ("version", struct_amdxdna_drm_query_aie_version), + ("core", struct_amdxdna_drm_query_aie_tile_metadata), + ("mem", struct_amdxdna_drm_query_aie_tile_metadata), + ("shim", struct_amdxdna_drm_query_aie_tile_metadata), +] + + +class struct_amdxdna_drm_query_clock(Structure): + pass + + +struct_amdxdna_drm_query_clock._pack_ = 1 # source:False +struct_amdxdna_drm_query_clock._fields_ = [ + ("name", ctypes.c_ubyte * 16), + ("freq_mhz", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_query_clock_metadata(Structure): + _pack_ = 1 # source:False + _fields_ = [ + ("mp_npu_clock", struct_amdxdna_drm_query_clock), + ("h_clock", struct_amdxdna_drm_query_clock), + ] + + +# values for enumeration 'amdxdna_sensor_type' +amdxdna_sensor_type__enumvalues = { + 0: "AMDXDNA_SENSOR_TYPE_POWER", +} +AMDXDNA_SENSOR_TYPE_POWER = 0 +amdxdna_sensor_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_query_sensor(Structure): + pass + + +struct_amdxdna_drm_query_sensor._pack_ = 1 # source:False +struct_amdxdna_drm_query_sensor._fields_ = [ + ("label", ctypes.c_ubyte * 64), + ("input", ctypes.c_uint32), + ("max", ctypes.c_uint32), + ("average", ctypes.c_uint32), + ("highest", ctypes.c_uint32), + ("status", ctypes.c_ubyte * 64), + ("units", ctypes.c_ubyte * 16), + ("unitm", ctypes.c_byte), + ("type", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 6), +] + + +class struct_amdxdna_drm_query_hwctx(Structure): + pass + + +struct_amdxdna_drm_query_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_query_hwctx._fields_ = [ + ("context_id", ctypes.c_uint32), + ("start_col", ctypes.c_uint32), + ("num_col", ctypes.c_uint32), + ("pad", ctypes.c_uint32), + ("pid", ctypes.c_int64), + ("command_submissions", ctypes.c_uint64), + ("command_completions", ctypes.c_uint64), + ("migrations", ctypes.c_uint64), + ("preemptions", ctypes.c_uint64), + ("errors", ctypes.c_uint64), +] + + 
+class struct_amdxdna_drm_aie_mem(Structure): + pass + + +struct_amdxdna_drm_aie_mem._pack_ = 1 # source:False +struct_amdxdna_drm_aie_mem._fields_ = [ + ("col", ctypes.c_uint32), + ("row", ctypes.c_uint32), + ("addr", ctypes.c_uint32), + ("size", ctypes.c_uint32), + ("buf_p", ctypes.c_uint64), +] + + +class struct_amdxdna_drm_aie_reg(Structure): + pass + + +struct_amdxdna_drm_aie_reg._pack_ = 1 # source:False +struct_amdxdna_drm_aie_reg._fields_ = [ + ("col", ctypes.c_uint32), + ("row", ctypes.c_uint32), + ("addr", ctypes.c_uint32), + ("val", ctypes.c_uint32), +] + +# values for enumeration 'amdxdna_power_mode_type' +amdxdna_power_mode_type__enumvalues = { + 0: "POWER_MODE_DEFAULT", + 1: "POWER_MODE_LOW", + 2: "POWER_MODE_MEDIUM", + 3: "POWER_MODE_HIGH", +} +POWER_MODE_DEFAULT = 0 +POWER_MODE_LOW = 1 +POWER_MODE_MEDIUM = 2 +POWER_MODE_HIGH = 3 +amdxdna_power_mode_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_get_power_mode(Structure): + pass + + +struct_amdxdna_drm_get_power_mode._pack_ = 1 # source:False +struct_amdxdna_drm_get_power_mode._fields_ = [ + ("power_mode", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 7), +] + + +class struct_amdxdna_drm_query_firmware_version(Structure): + pass + + +struct_amdxdna_drm_query_firmware_version._pack_ = 1 # source:False +struct_amdxdna_drm_query_firmware_version._fields_ = [ + ("major", ctypes.c_uint32), + ("minor", ctypes.c_uint32), + ("patch", ctypes.c_uint32), + ("build", ctypes.c_uint32), +] + +# values for enumeration 'amdxdna_drm_get_param' +amdxdna_drm_get_param__enumvalues = { + 0: "DRM_AMDXDNA_QUERY_AIE_STATUS", + 1: "DRM_AMDXDNA_QUERY_AIE_METADATA", + 2: "DRM_AMDXDNA_QUERY_AIE_VERSION", + 3: "DRM_AMDXDNA_QUERY_CLOCK_METADATA", + 4: "DRM_AMDXDNA_QUERY_SENSORS", + 5: "DRM_AMDXDNA_QUERY_HW_CONTEXTS", + 6: "DRM_AMDXDNA_READ_AIE_MEM", + 7: "DRM_AMDXDNA_READ_AIE_REG", + 8: "DRM_AMDXDNA_QUERY_FIRMWARE_VERSION", + 9: "DRM_AMDXDNA_GET_POWER_MODE", + 10: "DRM_AMDXDNA_NUM_GET_PARAM", +} 
+DRM_AMDXDNA_QUERY_AIE_STATUS = 0 +DRM_AMDXDNA_QUERY_AIE_METADATA = 1 +DRM_AMDXDNA_QUERY_AIE_VERSION = 2 +DRM_AMDXDNA_QUERY_CLOCK_METADATA = 3 +DRM_AMDXDNA_QUERY_SENSORS = 4 +DRM_AMDXDNA_QUERY_HW_CONTEXTS = 5 +DRM_AMDXDNA_READ_AIE_MEM = 6 +DRM_AMDXDNA_READ_AIE_REG = 7 +DRM_AMDXDNA_QUERY_FIRMWARE_VERSION = 8 +DRM_AMDXDNA_GET_POWER_MODE = 9 +DRM_AMDXDNA_NUM_GET_PARAM = 10 +amdxdna_drm_get_param = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_get_info(Structure): + pass + + +struct_amdxdna_drm_get_info._pack_ = 1 # source:False +struct_amdxdna_drm_get_info._fields_ = [ + ("param", ctypes.c_uint32), + ("buffer_size", ctypes.c_uint32), + ("buffer", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_GET_INFO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO , struct_amdxdna_drm_get_info ) # macro +class struct_amdxdna_drm_set_power_mode(Structure): + pass + + +struct_amdxdna_drm_set_power_mode._pack_ = 1 # source:False +struct_amdxdna_drm_set_power_mode._fields_ = [ + ("power_mode", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 7), +] + +# values for enumeration 'amdxdna_drm_set_param' +amdxdna_drm_set_param__enumvalues = { + 0: "DRM_AMDXDNA_SET_POWER_MODE", + 1: "DRM_AMDXDNA_WRITE_AIE_MEM", + 2: "DRM_AMDXDNA_WRITE_AIE_REG", + 3: "DRM_AMDXDNA_NUM_SET_PARAM", +} +DRM_AMDXDNA_SET_POWER_MODE = 0 +DRM_AMDXDNA_WRITE_AIE_MEM = 1 +DRM_AMDXDNA_WRITE_AIE_REG = 2 +DRM_AMDXDNA_NUM_SET_PARAM = 3 +amdxdna_drm_set_param = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_set_state(Structure): + pass + + +struct_amdxdna_drm_set_state._pack_ = 1 # source:False +struct_amdxdna_drm_set_state._fields_ = [ + ("param", ctypes.c_uint32), + ("buffer_size", ctypes.c_uint32), + ("buffer", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_SET_STATE = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE , struct_amdxdna_drm_set_state ) # macro +class struct_amdxdna_drm_syncobjs(Structure): + pass + + +struct_amdxdna_drm_syncobjs._pack_ = 1 # source:False +struct_amdxdna_drm_syncobjs._fields_ = [ 
+ ("handles", ctypes.c_uint64), + ("points", ctypes.c_uint64), + ("count", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + + +def struct_amdxdna_cmd_chain(command_count): + class struct_amdxdna_cmd_chain(Structure): + pass + + struct_amdxdna_cmd_chain._pack_ = 1 # source:False + struct_amdxdna_cmd_chain._fields_ = [ + ("command_count", ctypes.c_uint32), + ("submit_index", ctypes.c_uint32), + ("error_index", ctypes.c_uint32), + ("reserved", ctypes.c_uint32 * 3), + ("data", ctypes.c_uint64 * command_count), + ] + return struct_amdxdna_cmd_chain + + +def struct_amdxdna_cmd(count): + class struct_amdxdna_cmd(Structure): + pass + + struct_amdxdna_cmd._pack_ = 1 # source:False + struct_amdxdna_cmd._fields_ = [ + ("state", ctypes.c_uint32, 4), + ("unused", ctypes.c_uint32, 6), + ("extra_cu_masks", ctypes.c_uint32, 2), + ("count", ctypes.c_uint32, 11), + ("opcode", ctypes.c_uint32, 5), + ("reserved", ctypes.c_uint32, 4), + ("data", ctypes.c_uint32 * count), + ] + return struct_amdxdna_cmd + + +# DRM_IOCTL_AMDXDNA_SUBMIT_WAIT = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT , struct_amdxdna_drm_syncobjs ) # macro +# DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL , struct_amdxdna_drm_syncobjs ) # macro +__all__ = [ + "AMDXDNA_ACCEL_H_", + "AMDXDNA_BO_CMD", + "AMDXDNA_BO_DEV", + "AMDXDNA_BO_DEV_HEAP", + "AMDXDNA_BO_DMA", + "AMDXDNA_BO_INVALID", + "AMDXDNA_BO_SHMEM", + "AMDXDNA_CMD_SUBMIT_DEPENDENCY", + "AMDXDNA_CMD_SUBMIT_EXEC_BUF", + "AMDXDNA_CMD_SUBMIT_SIGNAL", + "AMDXDNA_DEV_TYPE_KMQ", + "AMDXDNA_DEV_TYPE_UMQ", + "AMDXDNA_DEV_TYPE_UNKNOWN", + "AMDXDNA_DRIVER_MAJOR", + "AMDXDNA_DRIVER_MINOR", + "AMDXDNA_INVALID_ADDR", + "AMDXDNA_INVALID_BO_HANDLE", + "AMDXDNA_INVALID_CMD_HANDLE", + "AMDXDNA_INVALID_CTX_HANDLE", + "AMDXDNA_INVALID_FENCE_HANDLE", + "AMDXDNA_SENSOR_TYPE_POWER", + "DRM_AMDXDNA_CONFIG_HWCTX", + "DRM_AMDXDNA_CREATE_BO", + "DRM_AMDXDNA_CREATE_HWCTX", + "DRM_AMDXDNA_DESTROY_HWCTX", + "DRM_AMDXDNA_EXEC_CMD", + 
"DRM_AMDXDNA_GET_BO_INFO", + "DRM_AMDXDNA_GET_INFO", + "DRM_AMDXDNA_GET_POWER_MODE", + "DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF", + "DRM_AMDXDNA_HWCTX_CONFIG_CU", + "DRM_AMDXDNA_HWCTX_CONFIG_NUM", + "DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF", + "DRM_AMDXDNA_NUM_GET_PARAM", + "DRM_AMDXDNA_NUM_IOCTLS", + "DRM_AMDXDNA_NUM_SET_PARAM", + "DRM_AMDXDNA_QUERY_AIE_METADATA", + "DRM_AMDXDNA_QUERY_AIE_STATUS", + "DRM_AMDXDNA_QUERY_AIE_VERSION", + "DRM_AMDXDNA_QUERY_CLOCK_METADATA", + "DRM_AMDXDNA_QUERY_FIRMWARE_VERSION", + "DRM_AMDXDNA_QUERY_HW_CONTEXTS", + "DRM_AMDXDNA_QUERY_SENSORS", + "DRM_AMDXDNA_READ_AIE_MEM", + "DRM_AMDXDNA_READ_AIE_REG", + "DRM_AMDXDNA_SET_POWER_MODE", + "DRM_AMDXDNA_SET_STATE", + "DRM_AMDXDNA_SUBMIT_SIGNAL", + "DRM_AMDXDNA_SUBMIT_WAIT", + "DRM_AMDXDNA_SYNC_BO", + "DRM_AMDXDNA_WAIT_CMD", + "DRM_AMDXDNA_WRITE_AIE_MEM", + "DRM_AMDXDNA_WRITE_AIE_REG", + "POWER_MODE_DEFAULT", + "POWER_MODE_HIGH", + "POWER_MODE_LOW", + "POWER_MODE_MEDIUM", + "SYNC_DIRECT_FROM_DEVICE", + "SYNC_DIRECT_TO_DEVICE", + "amdxdna_bo_type", + "amdxdna_cmd_type", + "amdxdna_device_type", + "amdxdna_drm_config_hwctx_param", + "amdxdna_drm_get_param", + "amdxdna_drm_ioctl_id", + "amdxdna_drm_set_param", + "amdxdna_power_mode_type", + "amdxdna_sensor_type", + "struct_amdxdna_cu_config", + "struct_amdxdna_drm_aie_mem", + "struct_amdxdna_drm_aie_reg", + "struct_amdxdna_drm_config_hwctx", + "struct_amdxdna_drm_create_bo", + "struct_amdxdna_drm_create_hwctx", + "struct_amdxdna_drm_destroy_hwctx", + "struct_amdxdna_drm_exec_cmd", + "struct_amdxdna_drm_get_bo_info", + "struct_amdxdna_drm_get_info", + "struct_amdxdna_drm_get_power_mode", + "struct_amdxdna_drm_query_aie_metadata", + "struct_amdxdna_drm_query_aie_status", + "struct_amdxdna_drm_query_aie_tile_metadata", + "struct_amdxdna_drm_query_aie_version", + "struct_amdxdna_drm_query_clock", + "struct_amdxdna_drm_query_clock_metadata", + "struct_amdxdna_drm_query_firmware_version", + "struct_amdxdna_drm_query_hwctx", + "struct_amdxdna_drm_query_sensor", + 
"struct_amdxdna_drm_set_power_mode", + "struct_amdxdna_drm_set_state", + "struct_amdxdna_drm_sync_bo", + "struct_amdxdna_drm_syncobjs", + "struct_amdxdna_drm_wait_cmd", + "struct_amdxdna_hwctx_param_config_cu", + "struct_amdxdna_qos_info", + "struct_amdxdna_cmd_chain", + "struct_amdxdna_cmd", +] diff --git a/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py b/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py new file mode 100644 index 000000000..38aafabc0 --- /dev/null +++ b/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py @@ -0,0 +1,217 @@ +import argparse +import array +import ctypes +import ctypes.util +import fcntl +import pathlib +import re +import struct +from argparse import Namespace +from pprint import pformat + +import amdxdna_accel +from amdxdna_accel import ( + struct_amdxdna_drm_query_aie_version, + struct_amdxdna_drm_get_info, + struct_amdxdna_drm_query_aie_metadata, + DRM_AMDXDNA_QUERY_AIE_VERSION, + DRM_AMDXDNA_QUERY_AIE_METADATA, +) + +_IOC_NRBITS = 8 +_IOC_TYPEBITS = 8 +_IOC_SIZEBITS = 14 +_IOC_DIRBITS = 2 + +_IOC_NRMASK = (1 << _IOC_NRBITS) - 1 +_IOC_TYPEMASK = (1 << _IOC_TYPEBITS) - 1 +_IOC_SIZEMASK = (1 << _IOC_SIZEBITS) - 1 +_IOC_DIRMASK = (1 << _IOC_DIRBITS) - 1 + +_IOC_NRSHIFT = 0 +_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS +_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS +_IOC_DIRSHIFT = _IOC_SIZESHIFT + _IOC_SIZEBITS + +IOC_NONE = 0 +IOC_WRITE = 1 +IOC_READ = 2 + + +def _IOC(dir, type, nr, size): + assert dir <= _IOC_DIRMASK, dir + assert type <= _IOC_TYPEMASK, type + assert nr <= _IOC_NRMASK, nr + assert size <= _IOC_SIZEMASK, size + return ( + (dir << _IOC_DIRSHIFT) + | (type << _IOC_TYPESHIFT) + | (nr << _IOC_NRSHIFT) + | (size << _IOC_SIZESHIFT) + ) + + +def _IOC_TYPECHECK(t): + if isinstance(t, (memoryview, bytearray)): + size = len(t) + elif isinstance(t, struct.Struct): + size = t.size + elif isinstance(t, array.array): + size = t.itemsize * len(t) + else: + size = ctypes.sizeof(t) + assert size <= _IOC_SIZEMASK, size + 
return size
+
+
+def _IOWR(type, nr, size):
+    """Build the request number for a read/write ioctl.
+
+    ``size`` is not a byte count: it is passed through ``_IOC_TYPECHECK``,
+    which derives the argument size from the object or ctypes type given.
+    """
+    return _IOC(IOC_READ | IOC_WRITE, type, nr, _IOC_TYPECHECK(size))
+
+
+def get_struct(argp, stype):
+    """Reinterpret the integer address ``argp`` as a ctypes ``stype`` object."""
+    return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents
+
+
+def get_void_ptr_to_struct(s):
+    """Return a ``ctypes.c_void_p`` holding the address of ctypes object ``s``."""
+    ptr = ctypes.pointer(s)
+    return ctypes.cast(ptr, ctypes.c_void_p)
+
+
+def format_struct(s):
+    """Return a pretty-formatted string of ``s`` converted with its ``as_dict``."""
+    return pformat(s.as_dict(s))
+
+
+# DRM_IOWR uses DRM_IOCTL_BASE as the ioctl type; ioctls_from_header adds
+# DRM_COMMAND_BASE to every DRM_AMDXDNA_* command id.
+DRM_IOCTL_BASE = ord("d")
+DRM_COMMAND_BASE = 0x40
+
+
+def DRM_IOWR(nr, type):
+    """Build a read/write ioctl request number ``nr`` of type ``DRM_IOCTL_BASE``.
+
+    ``type`` describes the ioctl argument and is forwarded to ``_IOWR`` as its
+    ``size`` parameter.
+    """
+    return _IOWR(DRM_IOCTL_BASE, nr, type)
+
+
+def ioctls_from_header():
+    """Build the DRM_IOCTL_AMDXDNA_* request numbers from amdxdna_accel.py.
+
+    The sibling ``amdxdna_accel.py`` is read as text and searched for
+    ``DRM_IOCTL_AMDXDNA_<NAME> = DRM_IOWR ( DRM_COMMAND_BASE +
+    DRM_AMDXDNA_<NAME> , struct_amdxdna_drm_<struct> )`` definitions.
+
+    Returns:
+        An ``argparse.Namespace`` with one ``DRM_IOCTL_AMDXDNA_<NAME>``
+        attribute per match, holding the computed request number.
+    """
+    hdr = (
+        (pathlib.Path(__file__).parent / "amdxdna_accel.py")
+        .read_text()
+        .replace("\\\n", "")
+    )
+    # Raw string: the backslash-escaped parentheses and plus sign are regex
+    # escapes, not Python string escapes, and would otherwise trigger
+    # invalid-escape warnings.
+    pattern = r"DRM_IOCTL_AMDXDNA_([A-Z0-9_]+) = DRM_IOWR \( DRM_COMMAND_BASE \+ DRM_AMDXDNA_([A-Z0-9_]+) , struct_amdxdna_drm_([a-z0-9_]+) \)"
+    matches = re.findall(pattern, hdr, re.MULTILINE)
+    ioctls = Namespace()
+    for name, offset, sname in matches:
+        # Both captured names must agree, and the command id must exist in
+        # the amdxdna_accel module.
+        assert name == offset
+        offset = f"DRM_AMDXDNA_{name}"
+        assert hasattr(amdxdna_accel, offset)
+        offset = getattr(amdxdna_accel, offset)
+        struc = getattr(amdxdna_accel, "struct_amdxdna_drm_" + sname)
+        setattr(
+            ioctls,
+            f"DRM_IOCTL_AMDXDNA_{name}",
+            DRM_IOWR(DRM_COMMAND_BASE + offset, struc),
+        )
+
+    return ioctls
+
+
+# Computed once at import time; the query helpers below read it.
+ioctls = ioctls_from_header()
+
+
+def _get_info(drv_fd, param, struct_type):
+    """Issue a GET_INFO ioctl for ``param`` and return the filled structure.
+
+    A fresh ``struct_type`` instance is allocated, its address and size are
+    placed in a ``struct_amdxdna_drm_get_info`` request, and the request is
+    sent on ``drv_fd``.
+    """
+    result = struct_type()
+    info_params = struct_amdxdna_drm_get_info(
+        param,
+        ctypes.sizeof(struct_type),
+        get_void_ptr_to_struct(result).value,
+    )
+
+    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params)
+    return result
+
+
+def get_aie_version(drv_fd):
+    """Return the ``(major, minor)`` pair reported by the AIE version query."""
+    version = _get_info(
+        drv_fd, DRM_AMDXDNA_QUERY_AIE_VERSION, struct_amdxdna_drm_query_aie_version
+    )
+
+    return version.major, version.minor
+
+
+def get_aie_metadata(drv_fd):
+    """Return the AIE metadata query result as a pretty-formatted string."""
+    metadata = _get_info(
+        drv_fd, DRM_AMDXDNA_QUERY_AIE_METADATA, struct_amdxdna_drm_query_aie_metadata
+    )
+
+    return format_struct(metadata)
+
+
+def get_core_n_rows(drv_fd):
+    """Return ``core.row_count`` from the AIE metadata query."""
+    metadata = _get_info(
+        drv_fd, DRM_AMDXDNA_QUERY_AIE_METADATA, struct_amdxdna_drm_query_aie_metadata
+    )
+    return metadata.core.row_count
+
+
+def find_npu_device():
+    """Return the resolved sysfs path of the device bound to the amdxdna driver.
+
+    Scans the symlinks in ``/sys/bus/pci/drivers/amdxdna`` and returns the
+    first one whose resolved target starts with ``/sys/devices/pci``.
+
+    Raises:
+        RuntimeError: if no such symlink exists.
+    """
+    drvpath = pathlib.Path("/sys/bus/pci/drivers/amdxdna")
+    for file in drvpath.iterdir():
+        if file.is_symlink():
+            actual_path = (drvpath / file.readlink()).resolve()
+            if str(actual_path).startswith("/sys/devices/pci"):
+                return actual_path
+    raise RuntimeError("npu device not found")
+
+
+def read_vbnv(npu_device_path):
+    """Return the text after the last ``-`` in the device's ``vbnv`` file.
+
+    The file content must start with ``RyzenAI-``; surrounding whitespace is
+    stripped from the returned suffix.
+    """
+    # Context manager so the sysfs file is closed once it has been read.
+    with open(npu_device_path / "vbnv") as f:
+        vbnv = f.read()
+    assert vbnv.startswith("RyzenAI-")
+    return vbnv.split("-")[-1].strip()
+
+
+def get_core_n_cols(drv_fd, npu_device):
+    """Return the number of core columns for ``npu_device``.
+
+    Args:
+        drv_fd: open handle on the accel device node.
+        npu_device: device name as returned by ``read_vbnv``.
+
+    Raises:
+        NotImplementedError: if ``npu_device`` is neither ``npu1`` nor ``npu4``.
+    """
+    metadata = _get_info(
+        drv_fd, DRM_AMDXDNA_QUERY_AIE_METADATA, struct_amdxdna_drm_query_aie_metadata
+    )
+    if npu_device == "npu1":
+        # phoenix
+        return metadata.cols - 1
+    elif npu_device == "npu4":
+        # strix
+        return metadata.cols
+
+    # Raise (not return) so an unknown device is never mistaken for a count.
+    raise NotImplementedError(f"unrecognized {npu_device=}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--npu-device", action="store_true")
+    parser.add_argument("--num-rows", action="store_true")
+    parser.add_argument("--num-cols", action="store_true")
+    parser.add_argument("--aie-metadata", action="store_true")
+    parser.add_argument("--aie-version", action="store_true")
+    args = parser.parse_args()
+
+    drv_path = "/dev/accel/accel0"
+    drv_fd = open(drv_path, "r+")
+    npu_device_path = find_npu_device()
+    npu_device = read_vbnv(npu_device_path)
+
+    if args.npu_device:
+
print(npu_device) + if args.num_rows: + print(get_core_n_rows(drv_fd)) + if args.num_cols: + print(get_core_n_cols(drv_fd, npu_device)) + if args.aie_metadata: + print(get_aie_metadata(drv_fd)) + if args.aie_version: + print(get_aie_version(drv_fd)) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 8ddd22ddc..96f060d4c 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -197,6 +197,11 @@ def generate_aie_output(config, aie_vmfb, input_args, function_name, name, outpu ] if function_name: run_args += [f"--function={function_name}"] + if config.xrt_lite_n_core_rows is not None: + run_args += [f"--xrt_lite_n_core_rows={config.xrt_lite_n_core_rows}"] + if config.xrt_lite_n_core_cols is not None: + run_args += [f"--xrt_lite_n_core_cols={config.xrt_lite_n_core_cols}"] + if config.reset_npu_between_runs: shell_out(config.reset_npu_script, verbose=config.verbose) @@ -269,6 +274,8 @@ def __init__( do_not_run_aie, additional_aie_compilation_flags, device_hal, + xrt_lite_n_core_rows, + xrt_lite_n_core_cols, ): self.output_dir = output_dir self.iree_install_dir = iree_install_dir @@ -286,6 +293,8 @@ def __init__( self.do_not_run_aie = do_not_run_aie self.additional_aie_compilation_flags = additional_aie_compilation_flags self.device_hal = device_hal + self.xrt_lite_n_core_rows = xrt_lite_n_core_rows + self.xrt_lite_n_core_cols = xrt_lite_n_core_cols # Try get the xrt and (linux) kernel versions. 
self.linux_kernel = "undetermined" @@ -849,7 +858,9 @@ def all_tests( do_not_run_aie, test_set, additional_aie_compilation_flags, - device_hal + device_hal, + xrt_lite_n_core_rows, + xrt_lite_n_core_cols, ): """ There are a few ways to add tests to this script: @@ -891,7 +902,9 @@ def all_tests( reset_npu_between_runs, do_not_run_aie, additional_aie_compilation_flags, - device_hal + device_hal, + xrt_lite_n_core_rows, + xrt_lite_n_core_cols, ) if verbose: print(config) @@ -946,6 +959,8 @@ def all_tests( parser.add_argument("peano_install_dir", type=abs_path) parser.add_argument("--xrt-dir", type=abs_path) parser.add_argument("--vitis-dir", type=abs_path) + parser.add_argument("--xrt_lite_n_core_rows", type=int) + parser.add_argument("--xrt_lite_n_core_cols", type=int) # TODO(newling) make bool options boolean, not integer (tried but had issues) parser.add_argument( @@ -1052,5 +1067,7 @@ def all_tests( args.do_not_run_aie, test_set_list, args.additional_aie_compilation_flags, - args.device_hal + args.device_hal, + args.xrt_lite_n_core_rows, + args.xrt_lite_n_core_cols, ) diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index c1c5a6d56..6a6146487 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -464,6 +464,13 @@ function run_matmul_test() { --device=${DEVICE_HAL} \ --max_elements_to_check=${max_elements_to_check}" + if [ -n "$XRT_LITE_N_CORE_ROWS" ]; then + COMMAND="${COMMAND} --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS" + fi + if [ -n "$XRT_LITE_N_CORE_COLS" ]; then + COMMAND="${COMMAND} --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS" + fi + total_num_runs=$(( num_repeat_runs * num_corruption_repeat_runs)) echo "**** Running '${name}' matmul test ${total_num_runs} times (command ${COMMAND}) ****" for i in $(seq 1 $num_repeat_runs); do diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h index 62b2a9fae..c969388ba 100644 --- 
a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -10,13 +10,16 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -struct iree_hal_xrt_lite_device_options {}; +struct iree_hal_xrt_lite_device_params { + int32_t n_core_rows; + int32_t n_core_cols; +}; IREE_API_EXPORT void iree_hal_xrt_lite_device_options_initialize( - struct iree_hal_xrt_lite_device_options* out_params); + struct iree_hal_xrt_lite_device_params* out_params); struct iree_hal_xrt_lite_driver_options { - struct iree_hal_xrt_lite_device_options default_device_options; + struct iree_hal_xrt_lite_device_params device_params; }; IREE_API_EXPORT void iree_hal_xrt_lite_driver_options_initialize( @@ -30,11 +33,12 @@ IREE_API_EXPORT void iree_hal_xrt_lite_driver_options_initialize( IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( iree_string_view_t identifier, const struct iree_hal_xrt_lite_driver_options* options, + const struct iree_hal_xrt_lite_device_params* device_params, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver); IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create( iree_string_view_t identifier, - const struct iree_hal_xrt_lite_device_options* options, + const struct iree_hal_xrt_lite_device_params* params, iree_allocator_t host_allocator, iree_hal_device_t** out_device); #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index 6239b10e4..c9d3bb64e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -8,6 +8,7 @@ #include "iree-amd-aie/driver/xrt-lite/allocator.h" #include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree-amd-aie/driver/xrt-lite/device.h" #include "iree-amd-aie/driver/xrt-lite/direct_command_buffer.h" #include "iree-amd-aie/driver/xrt-lite/nop_executable_cache.h" #include 
"iree-amd-aie/driver/xrt-lite/nop_semaphore.h" @@ -21,38 +22,25 @@ namespace { extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; } -struct iree_hal_xrt_lite_device { - iree_hal_resource_t resource; - iree_allocator_t host_allocator; - // TODO(max): not used because "device allocations" are performed through - // device - iree_hal_allocator_t* device_allocator; - // block pool used for command buffer allocations, uses a larger block size - // since command buffers can contain inlined data - iree_arena_block_pool_t block_pool; - shim_xdna::device* shim_device; - // should come last; see the definition of total_size below in - // iree_hal_xrt_lite_device_create - iree_string_view_t identifier; - - iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_options* options, - iree_allocator_t host_allocator) { - IREE_ASSERT_ARGUMENT(options); - IREE_TRACE_ZONE_BEGIN(z0); - - iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &resource); - this->host_allocator = host_allocator; - shim_device = new shim_xdna::device; - - iree_status_t status = iree_hal_xrt_lite_allocator_create( - host_allocator, shim_device, &device_allocator); - IREE_ASSERT(iree_status_is_ok(status)); - iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, - &block_pool); +iree_hal_xrt_lite_device::iree_hal_xrt_lite_device( + const iree_hal_xrt_lite_device_params* options, + iree_allocator_t host_allocator) { + IREE_ASSERT_ARGUMENT(options); + IREE_TRACE_ZONE_BEGIN(z0); - IREE_TRACE_ZONE_END(z0); - } -}; + iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &resource); + this->host_allocator = host_allocator; + shim_device = + new shim_xdna::device(options->n_core_rows, options->n_core_cols); + + iree_status_t status = iree_hal_xrt_lite_allocator_create( + host_allocator, shim_device, &device_allocator); + IREE_ASSERT(iree_status_is_ok(status)); + iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, + &block_pool); + + 
IREE_TRACE_ZONE_END(z0); +} static iree_status_t iree_hal_xrt_lite_device_create_executable_cache( iree_hal_device_t* base_device, iree_string_view_t identifier, @@ -123,8 +111,7 @@ static iree_status_t iree_hal_xrt_lite_device_queue_execute( IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED; IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_xrt_lite_direct_command_buffer_create( - device->shim_device, device->device_allocator, mode, - IREE_HAL_COMMAND_CATEGORY_ANY, + device, mode, IREE_HAL_COMMAND_CATEGORY_ANY, /*binding_capacity=*/0, &device->block_pool, device->host_allocator, &xrt_command_buffer)); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -244,7 +231,7 @@ static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( } void iree_hal_xrt_lite_device_options_initialize( - iree_hal_xrt_lite_device_options* out_options) { + iree_hal_xrt_lite_device_params* out_options) { IREE_TRACE_ZONE_BEGIN(z0); memset(out_options, 0, sizeof(*out_options)); @@ -254,7 +241,7 @@ void iree_hal_xrt_lite_device_options_initialize( iree_status_t iree_hal_xrt_lite_device_create( iree_string_view_t identifier, - const iree_hal_xrt_lite_device_options* options, + const iree_hal_xrt_lite_device_params* options, iree_allocator_t host_allocator, iree_hal_device_t** out_device) { IREE_ASSERT_ARGUMENT(options); IREE_ASSERT_ARGUMENT(out_device); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h new file mode 100644 index 000000000..ad3141e88 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h @@ -0,0 +1,33 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_DEVICE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_DEVICE_H_ + +#include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree/base/internal/arena.h" +#include "iree/hal/api.h" + +struct iree_hal_xrt_lite_device { + iree_hal_resource_t resource; + iree_allocator_t host_allocator; + // TODO(max): not used because "device allocations" are performed through + // device + iree_hal_allocator_t* device_allocator; + // block pool used for command buffer allocations, uses a larger block size + // since command buffers can contain inlined data + iree_arena_block_pool_t block_pool; + shim_xdna::device* shim_device; + // should come last; see the definition of total_size below in + // iree_hal_xrt_lite_device_create + iree_string_view_t identifier; + + iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_params* options, + iree_allocator_t host_allocator); +}; + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_DEVICE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc index f5a79dc10..5861ebd8b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc @@ -22,7 +22,7 @@ struct iree_hal_xrt_lite_direct_command_buffer { // Staging arena used for host->device transfers. 
iree_arena_allocator_t arena; - shim_xdna::device* shim_device; + iree_hal_xrt_lite_device* device; }; namespace { @@ -31,13 +31,12 @@ extern const iree_hal_command_buffer_vtable_t } // namespace iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( - shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator, - iree_hal_command_buffer_mode_t mode, + iree_hal_xrt_lite_device* device, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_command_buffer_t** out_command_buffer) { - IREE_ASSERT_ARGUMENT(device_allocator); + IREE_ASSERT_ARGUMENT(device); IREE_ASSERT_ARGUMENT(out_command_buffer); *out_command_buffer = nullptr; if (binding_capacity > 0) { @@ -57,12 +56,12 @@ iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( mode, binding_capacity), reinterpret_cast(&command_buffer))); iree_hal_command_buffer_initialize( - device_allocator, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY, - binding_capacity, + device->device_allocator, mode, command_categories, + IREE_HAL_QUEUE_AFFINITY_ANY, binding_capacity, reinterpret_cast(command_buffer) + sizeof(*command_buffer), &iree_hal_xrt_lite_direct_command_buffer_vtable, &command_buffer->base); command_buffer->host_allocator = host_allocator; - command_buffer->shim_device = shim_device; + command_buffer->device = device; iree_arena_initialize(block_pool, &command_buffer->arena); iree_status_t status = iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set); @@ -164,15 +163,17 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch( &executable)); size_t ctrl_code_size = kernel_params.asm_inst.size() * sizeof(uint32_t); - auto bo_ctrl_code = command_buffer->shim_device->alloc_bo( + auto bo_ctrl_code = command_buffer->device->shim_device->alloc_bo( ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); uint32_t* instr_buffer = 
static_cast(bo_ctrl_code->map()); memcpy(instr_buffer, kernel_params.asm_inst.data(), ctrl_code_size); bo_ctrl_code->sync(shim_xdna::direction::host2device); - shim_xdna::kernel ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU); - shim_xdna::hw_ctx context = command_buffer->shim_device->create_hw_context( - kernel_params.pdi, kernel_params.kernel_name); + shim_xdna::kernel ebuf(command_buffer->device->shim_device->get_pdev(), + ERT_START_CU); + shim_xdna::hw_ctx context = + command_buffer->device->shim_device->create_hw_context( + kernel_params.pdi, kernel_params.kernel_name); shim_xdna::cuidx_t cu_idx = context.open_cu_context(kernel_params.kernel_name); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h index 1612c9509..da797f20f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h @@ -7,6 +7,7 @@ #ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ #define IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ +#include "iree-amd-aie/driver/xrt-lite/device.h" #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" #include "iree/base/internal/arena.h" #include "iree/hal/api.h" @@ -14,8 +15,7 @@ // `out_command_buffer` must be released by the caller (see // iree_hal_command_buffer_release). 
iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( - shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator, - iree_hal_command_buffer_mode_t mode, + iree_hal_xrt_lite_device* device, iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index eda721bb1..87a7b9c1f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -26,8 +26,7 @@ void iree_hal_xrt_lite_driver_options_initialize( IREE_TRACE_ZONE_BEGIN(z0); memset(out_options, 0, sizeof(*out_options)); - iree_hal_xrt_lite_device_options_initialize( - &out_options->default_device_options); + iree_hal_xrt_lite_device_options_initialize(&out_options->device_params); IREE_TRACE_ZONE_END(z0); } @@ -35,6 +34,7 @@ void iree_hal_xrt_lite_driver_options_initialize( IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( iree_string_view_t identifier, const iree_hal_xrt_lite_driver_options* options, + const iree_hal_xrt_lite_device_params* device_params, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) { IREE_ASSERT_ARGUMENT(options); IREE_ASSERT_ARGUMENT(out_driver); @@ -53,6 +53,7 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( identifier, &driver->identifier, reinterpret_cast(driver) + total_size - identifier.size); memcpy(&driver->options, options, sizeof(*options)); + memcpy(&driver->options.device_params, device_params, sizeof(*device_params)); *out_driver = reinterpret_cast(driver); IREE_TRACE_ZONE_END(z0); @@ -99,8 +100,7 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( base_driver, iree_hal_xrt_lite_driver_vtable, 
iree_hal_xrt_lite_driver); - iree_hal_xrt_lite_device_options options = - driver->options.default_device_options; + iree_hal_xrt_lite_device_params options = driver->options.device_params; IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_device_create(driver->identifier, &options, @@ -116,8 +116,7 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path( iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver); - iree_hal_xrt_lite_device_options options = - driver->options.default_device_options; + iree_hal_xrt_lite_device_params options = driver->options.device_params; IREE_TRACE_ZONE_END(z0); return iree_hal_xrt_lite_device_create(driver->identifier, &options, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c index 928305857..45617cbe3 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c @@ -8,38 +8,128 @@ #include "iree-amd-aie/driver/xrt-lite/api.h" #include "iree/base/api.h" +#include "iree/base/internal/flags.h" + +IREE_FLAG(int32_t, xrt_lite_n_core_rows, 4, + "Number of core rows to use on NPU."); +IREE_FLAG(int32_t, xrt_lite_n_core_cols, 4, + "Number of core cols to use on NPU."); + +static const iree_string_view_t key_xrt_lite_n_core_rows = + iree_string_view_literal("xrt_lite_n_core_rows"); +static const iree_string_view_t key_xrt_lite_n_core_cols = + iree_string_view_literal("xrt_lite_n_core_cols"); static iree_status_t iree_hal_xrt_lite_driver_factory_enumerate( void* self, iree_host_size_t* out_driver_info_count, const iree_hal_driver_info_t** out_driver_infos) { + IREE_TRACE_ZONE_BEGIN(z0); + static const iree_hal_driver_info_t default_driver_info = { .driver_name = IREE_SVL("xrt-lite"), .full_name = IREE_SVL("XRT-LITE driver (for AIE)"), }; 
*out_driver_info_count = 1; *out_driver_infos = &default_driver_info; + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_driver_parse_flags( + iree_string_pair_builder_t* builder) { + IREE_TRACE_ZONE_BEGIN(z0); + + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_string_pair_builder_add_int32(builder, key_xrt_lite_n_core_rows, + FLAG_xrt_lite_n_core_rows)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_string_pair_builder_add_int32(builder, key_xrt_lite_n_core_cols, + FLAG_xrt_lite_n_core_cols)); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_driver_populate_options( + iree_allocator_t host_allocator, + struct iree_hal_xrt_lite_driver_options* driver_options, + struct iree_hal_xrt_lite_device_params* device_params, + iree_host_size_t pairs_size, iree_string_pair_t* pairs) { + IREE_TRACE_ZONE_BEGIN(z0); + + for (iree_host_size_t i = 0; i < pairs_size; ++i) { + iree_string_view_t key = pairs[i].key; + iree_string_view_t value = pairs[i].value; + int32_t ivalue; + + if (iree_string_view_equal(key, key_xrt_lite_n_core_rows)) { + if (!iree_string_view_atoi_int32(value, &ivalue)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_rows' expected to be int. Got: '%.*s'", + (int)value.size, value.data); + } + device_params->n_core_rows = ivalue; + } else if (iree_string_view_equal(key, key_xrt_lite_n_core_cols)) { + if (!iree_string_view_atoi_int32(value, &ivalue)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_cols' expected to be int. 
Got: '%.*s'", + (int)value.size, value.data); + } + device_params->n_core_cols = ivalue; + } else { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, + "Unrecognized options: %.*s", (int)key.size, + key.data); + } + } + + IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } static iree_status_t iree_hal_xrt_lite_driver_factory_try_create( void* self, iree_string_view_t driver_name, iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) { + IREE_TRACE_ZONE_BEGIN(z0); + if (!iree_string_view_equal(driver_name, IREE_SV("xrt-lite"))) { + IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_UNAVAILABLE, "no driver '%.*s' is provided by this factory", (int)driver_name.size, driver_name.data); } - // TODO(max): populate options from flags. This driver module file is only - // used in native tools that have access to the flags library. Programmatic - // creation of the driver and devices will bypass this file and pass the - // options via this struct or key-value string parameters. 
- struct iree_hal_xrt_lite_driver_options options; - iree_hal_xrt_lite_driver_options_initialize(&options); + struct iree_hal_xrt_lite_driver_options driver_options; + iree_hal_xrt_lite_driver_options_initialize(&driver_options); + struct iree_hal_xrt_lite_device_params device_params; + iree_hal_xrt_lite_device_options_initialize(&device_params); + + iree_string_pair_builder_t flag_option_builder; + iree_string_pair_builder_initialize(host_allocator, &flag_option_builder); + iree_status_t status = + iree_hal_xrt_lite_driver_parse_flags(&flag_option_builder); + + if (iree_status_is_ok(status)) { + IREE_TRACE_ZONE_END(z0); + status = iree_hal_xrt_lite_driver_populate_options( + host_allocator, &driver_options, &device_params, + iree_string_pair_builder_size(&flag_option_builder), + iree_string_pair_builder_pairs(&flag_option_builder)); + } else { + IREE_TRACE_ZONE_END(z0); + return status; + } - iree_status_t status = iree_hal_xrt_lite_driver_create( - driver_name, &options, host_allocator, out_driver); + status = iree_hal_xrt_lite_driver_create( + driver_name, &driver_options, &device_params, host_allocator, out_driver); + IREE_TRACE_ZONE_END(z0); return status; } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index ae87fc0b3..6a591e9ec 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -141,7 +141,10 @@ void *pdev::mmap(void *addr, size_t len, int prot, int flags, return ret; } -device::device() { SHIM_DEBUG("Created KMQ device"); } +device::device(uint32_t n_rows, uint32_t n_cols) + : n_rows(n_rows), n_cols(n_cols) { + SHIM_DEBUG("Created KMQ device"); +} device::~device() { SHIM_DEBUG("Destroying KMQ device"); } @@ -150,12 +153,12 @@ const pdev &device::get_pdev() const { return m_pdev; } hw_ctx device::create_hw_context(const std::vector &pdi, const 
std::string &cu_name, const std::map &qos) { - return hw_ctx(*this, pdi, cu_name, qos); + return {*this, pdi, cu_name, n_rows, n_cols, qos}; } hw_ctx device::create_hw_context(const std::vector &pdi, const std::string &cu_name) { - return hw_ctx(*this, pdi, cu_name); + return {*this, pdi, cu_name, n_rows, n_cols}; } std::unique_ptr device::alloc_bo(uint32_t ctx_id, size_t size, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h index f483960e1..8ace4e79d 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ -28,11 +28,11 @@ struct pdev { struct device { enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; - - mutable std::mutex m_mutex; pdev m_pdev; + uint32_t n_rows; + uint32_t n_cols; - device(); + device(uint32_t n_rows, uint32_t n_cols); ~device(); std::unique_ptr import_bo(int ehdl) const; diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp index 8784c2cc8..20a94efd7 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -14,8 +14,13 @@ namespace shim_xdna { hw_ctx::hw_ctx(device &dev, const std::map &qos, std::unique_ptr q, const std::vector &pdi, - const std::string &cu_name, size_t functional) - : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) { + const std::string &cu_name, uint32_t n_rows, uint32_t n_cols) + : m_device(dev), + m_q(std::move(q)), + m_num_rows(n_rows), + m_num_cols(n_cols), + m_doorbell(0), + m_log_buf(nullptr) { SHIM_DEBUG("Creating HW context..."); for (auto &[key, value] : qos) { @@ -33,18 +38,21 @@ hw_ctx::hw_ctx(device &dev, const std::map &qos, m_qos.priority = value; } - m_cu_info.push_back({.m_name = cu_name, .m_func = 
functional, .m_pdi = pdi}); + // TODO(max): multiple pdis? + m_cu_info.push_back( + {.m_name = cu_name, .m_func = /*functional*/ 0, .m_pdi = pdi}); if (m_cu_info.empty()) shim_err(EINVAL, "No valid DPU kernel found in xclbin"); - m_ops_per_cycle = 2048 /*aie_partition.ops_per_cycle*/; - m_num_cols = 4 /*aie_partition.ncol*/; + // TODO(max): configure this + m_ops_per_cycle = 2048; } hw_ctx::hw_ctx(device &device, const std::vector &pdi, - const std::string &cu_name, + const std::string &cu_name, uint32_t n_rows, uint32_t n_cols, const std::map &qos) - : hw_ctx(device, qos, std::make_unique(device), pdi, cu_name) { + : hw_ctx(device, qos, std::make_unique(device), pdi, cu_name, n_rows, + n_cols) { create_ctx_on_device(); std::vector cu_conf_param_buf(sizeof(amdxdna_hwctx_param_config_cu) + m_cu_info.size() * @@ -118,11 +126,7 @@ void hw_ctx::create_ctx_on_device() { arg.qos_p = reinterpret_cast(&m_qos); arg.umq_bo = m_q->m_queue_boh; arg.max_opc = m_ops_per_cycle; - // TODO(max) - // throw std::runtime_error("TODO(max): core_rows"); - // arg.num_tiles = m_num_cols * - // xrt_core::device_query(&m_device).core_rows; - arg.num_tiles = m_num_cols * 4; + arg.num_tiles = m_num_rows * m_num_cols; arg.log_buf_bo = m_log_bo ? 
m_log_bo->get_drm_bo_handle() : AMDXDNA_INVALID_BO_HANDLE; m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &arg); @@ -133,7 +137,7 @@ void hw_ctx::create_ctx_on_device() { m_q->bind_hwctx(this); } -void hw_ctx::delete_ctx_on_device() { +void hw_ctx::delete_ctx_on_device() const { if (m_handle == AMDXDNA_INVALID_CTX_HANDLE) return; m_q->unbind_hwctx(); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h index b989c60ce..7a169e270 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -43,6 +43,7 @@ struct hw_ctx { std::vector m_cu_info; std::unique_ptr m_q; uint32_t m_ops_per_cycle; + uint32_t m_num_rows; uint32_t m_num_cols; uint32_t m_doorbell; std::unique_ptr m_log_bo; @@ -51,9 +52,10 @@ struct hw_ctx { hw_ctx(device &dev, const std::map &qos, std::unique_ptr q, const std::vector &pdi, - const std::string &cu_name, size_t functional = 0); + const std::string &cu_name, uint32_t n_rows, uint32_t n_cols); hw_ctx(device &dev, const std::vector &pdi, const std::string &cu_name, + uint32_t n_rows, uint32_t n_cols, const std::map &qos = {}); ~hw_ctx(); // no copying @@ -67,7 +69,7 @@ struct hw_ctx { void create_ctx_on_device(); void init_log_buf(); void fini_log_buf() const; - void delete_ctx_on_device(); + void delete_ctx_on_device() const; hw_q *get_hw_queue() const; }; diff --git a/tests/conftest.py b/tests/conftest.py index 31c12bb7c..10b70107a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,10 @@ import numpy as np import pytest + +# TODO(max): connect this (or something) to xrt_lite_n_core_rows and xrt_lite_n_core_cols +from iree._runtime_libs._runtime import parse_flags + from iree.compiler import ir from iree.compiler._mlir_libs import get_dialect_registry from iree.compiler.api import Session, Output, Source, _initializeGlobalCL From 
1a3efc4682210648052538a0e53e93ffdc1e22eb Mon Sep 17 00:00:00 2001 From: makslevental Date: Thu, 17 Oct 2024 20:34:58 -0400 Subject: [PATCH 28/35] incorporate comments --- build_tools/ci/run_matmul_test.sh | 4 ++-- runtime/src/iree-amd-aie/driver/xrt-lite/device.cc | 2 +- runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc | 2 +- runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h | 2 +- .../iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp | 6 ++++-- .../iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h | 6 ++++-- .../driver/xrt-lite/shim/linux/kmq/shim_debug.cpp | 6 ++++-- 7 files changed, 17 insertions(+), 11 deletions(-) diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 6a6146487..f4954a0d5 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -464,10 +464,10 @@ function run_matmul_test() { --device=${DEVICE_HAL} \ --max_elements_to_check=${max_elements_to_check}" - if [ -n "$XRT_LITE_N_CORE_ROWS" ]; then + if [ -n "${XRT_LITE_N_CORE_ROWS:-}" ]; then COMMAND="${COMMAND} --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS" fi - if [ -n "$XRT_LITE_N_CORE_COLS" ]; then + if [ -n "${XRT_LITE_N_CORE_COLS:-}" ]; then COMMAND="${COMMAND} --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS" fi diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc index c9d3bb64e..323bd4aaa 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -66,7 +66,7 @@ static iree_status_t iree_hal_xrt_lite_device_create_command_buffer( if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) { IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_UNIMPLEMENTED, - "unimplmented multi-shot command buffer"); + "unimplemented multi-shot command buffer"); } iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( diff --git 
a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc index e6a29a17f..aedd01453 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc @@ -1,4 +1,4 @@ -// Copyright 2021 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h index f7c5615e9..01931b9a1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h @@ -1,4 +1,4 @@ -// Copyright 2021 The IREE Authors +// Copyright 2024 The IREE Authors // // Licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp index f20b32dc5..a142e281b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp @@ -1,6 +1,8 @@ +// Copyright 2024 The IREE Authors // -// Created by mlevental on 10/11/24. -// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "kernel.h" diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h index 2993a8465..ddc7a9283 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h @@ -1,6 +1,8 @@ +// Copyright 2024 The IREE Authors // -// Created by mlevental on 10/11/24. -// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #ifndef KERNEL_H #define KERNEL_H diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp index a41e16193..75b14fdfc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp @@ -1,6 +1,8 @@ +// Copyright 2024 The IREE Authors // -// Created by mlevental on 10/3/24. -// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "shim_debug.h" From 514db1db4b6a454cabe98ef7a4fd732cad810e5d Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 17 Oct 2024 22:09:46 -0400 Subject: [PATCH 29/35] make xrt_lite_n_core_rows, xrt_lite_n_core_cols required --- .github/workflows/ci-linux.yml | 4 +- .../xrt-lite/registration/driver_module.c | 45 ++++++++++++------- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 2 +- tests/conftest.py | 25 ++++++++--- 4 files changed, 51 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 9e3e25508..4642a3856 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -201,7 +201,9 @@ jobs: pytest -v tests \ --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ - --peano-install-dir=$PWD/llvm-aie + --peano-install-dir=$PWD/llvm-aie \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS - name: XRT-LITE tests run: | diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c index 45617cbe3..72f4841b1 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c @@ -10,9 +10,9 @@ #include "iree/base/api.h" #include "iree/base/internal/flags.h" -IREE_FLAG(int32_t, xrt_lite_n_core_rows, 4, +IREE_FLAG(int32_t, xrt_lite_n_core_rows, 0, "Number of core rows to use on NPU."); -IREE_FLAG(int32_t, xrt_lite_n_core_cols, 4, +IREE_FLAG(int32_t, xrt_lite_n_core_cols, 0, "Number of core cols to use on NPU."); static const iree_string_view_t key_xrt_lite_n_core_rows = @@ -71,6 +71,13 @@ static iree_status_t iree_hal_xrt_lite_driver_populate_options( "Option 'key_xrt_lite_n_core_rows' expected to be int. 
Got: '%.*s'", (int)value.size, value.data); } + if (ivalue <= 0) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_rows' expected to be > 0. Got: '%.*s'", + (int)value.size, value.data); + } device_params->n_core_rows = ivalue; } else if (iree_string_view_equal(key, key_xrt_lite_n_core_cols)) { if (!iree_string_view_atoi_int32(value, &ivalue)) { @@ -80,6 +87,13 @@ static iree_status_t iree_hal_xrt_lite_driver_populate_options( "Option 'key_xrt_lite_n_core_cols' expected to be int. Got: '%.*s'", (int)value.size, value.data); } + if (ivalue <= 0) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_cols' expected to be > 0. Got: '%.*s'", + (int)value.size, value.data); + } device_params->n_core_cols = ivalue; } else { IREE_TRACE_ZONE_END(z0); @@ -112,25 +126,22 @@ static iree_status_t iree_hal_xrt_lite_driver_factory_try_create( iree_string_pair_builder_t flag_option_builder; iree_string_pair_builder_initialize(host_allocator, &flag_option_builder); - iree_status_t status = - iree_hal_xrt_lite_driver_parse_flags(&flag_option_builder); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_driver_parse_flags(&flag_option_builder)); - if (iree_status_is_ok(status)) { - IREE_TRACE_ZONE_END(z0); - status = iree_hal_xrt_lite_driver_populate_options( - host_allocator, &driver_options, &device_params, - iree_string_pair_builder_size(&flag_option_builder), - iree_string_pair_builder_pairs(&flag_option_builder)); - } else { - IREE_TRACE_ZONE_END(z0); - return status; - } + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_driver_populate_options( + host_allocator, &driver_options, &device_params, + iree_string_pair_builder_size(&flag_option_builder), + iree_string_pair_builder_pairs(&flag_option_builder))); - status = iree_hal_xrt_lite_driver_create( - driver_name, &driver_options, &device_params, host_allocator, 
out_driver); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_driver_create(driver_name, &driver_options, + &device_params, host_allocator, + out_driver)); IREE_TRACE_ZONE_END(z0); - return status; + return iree_ok_status(); } IREE_API_EXPORT iree_status_t diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index 6a591e9ec..8b71d5f38 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -143,7 +143,7 @@ void *pdev::mmap(void *addr, size_t len, int prot, int flags, device::device(uint32_t n_rows, uint32_t n_cols) : n_rows(n_rows), n_cols(n_cols) { - SHIM_DEBUG("Created KMQ device"); + SHIM_DEBUG("Created KMQ device n_rows %d n_cols %d", n_rows, n_cols); } device::~device() { SHIM_DEBUG("Destroying KMQ device"); } diff --git a/tests/conftest.py b/tests/conftest.py index 10b70107a..69d06af45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,8 +4,8 @@ import numpy as np import pytest -# TODO(max): connect this (or something) to xrt_lite_n_core_rows and xrt_lite_n_core_cols from iree._runtime_libs._runtime import parse_flags +from ml_dtypes import bfloat16 from iree.compiler import ir from iree.compiler._mlir_libs import get_dialect_registry @@ -13,7 +13,6 @@ from iree.compiler.extras import types as T from iree.runtime import VmModule from iree.runtime import get_driver, Config, SystemContext -from ml_dtypes import bfloat16 for t in [ "i8", @@ -57,14 +56,28 @@ def pytest_addoption(parser): nargs="?", choices=["xrt", "xrt-lite"], ) + parser.addoption("--xrt_lite_n_core_rows", type=int) + parser.addoption("--xrt_lite_n_core_cols", type=int) @pytest.fixture(scope="session") -def global_cl_args(request): - _initializeGlobalCL( +def global_cl_args(request, pytestconfig): + compiler_flags = [ "--iree-hal-memoization=false", 
"--iree-hal-indirect-command-buffers=false", - ) + ] + _initializeGlobalCL(*compiler_flags) + + runtime_flags = [] + if pytestconfig.option.xrt_lite_n_core_rows is not None: + runtime_flags += [ + f"--xrt_lite_n_core_rows={pytestconfig.option.xrt_lite_n_core_rows}" + ] + if pytestconfig.option.xrt_lite_n_core_cols is not None: + runtime_flags += [ + f"--xrt_lite_n_core_cols={pytestconfig.option.xrt_lite_n_core_cols}" + ] + parse_flags(*runtime_flags) @pytest.fixture @@ -117,7 +130,7 @@ def session_module(iree_session) -> ir.Module: @pytest.fixture(scope="session") -def device(pytestconfig) -> ir.Module: +def device(pytestconfig, global_cl_args) -> ir.Module: yield get_driver(pytestconfig.option.device_hal).create_default_device() From a224bcc11fb662c1104dbc914964abc7acf6cfdb Mon Sep 17 00:00:00 2001 From: makslevental Date: Fri, 18 Oct 2024 10:34:34 -0400 Subject: [PATCH 30/35] really make xrt-lite default and test building in "cleanroom" --- .github/workflows/ci-linux-cleanroom.yml | 216 +++++++++++++++++++++++ .github/workflows/ci-linux.yml | 6 - .github/workflows/ci-macos.yml | 5 +- README.md | 84 ++------- iree_compiler_plugin.cmake | 6 - iree_runtime_plugin.cmake | 11 +- runtime/src/iree-amd-aie/CMakeLists.txt | 10 +- 7 files changed, 241 insertions(+), 97 deletions(-) create mode 100644 .github/workflows/ci-linux-cleanroom.yml diff --git a/.github/workflows/ci-linux-cleanroom.yml b/.github/workflows/ci-linux-cleanroom.yml new file mode 100644 index 000000000..f7a46a95d --- /dev/null +++ b/.github/workflows/ci-linux-cleanroom.yml @@ -0,0 +1,216 @@ +name: CI Ubuntu + +on: + workflow_call: + workflow_dispatch: + inputs: + force_debug_with_tmate: + type: boolean + description: 'Run the build with tmate session' + required: false + default: false + debug_with_tmate: + type: boolean + description: 'Run the build with a tmate session ONLY in case of failure' + required: false + default: false + pull_request: + merge_group: + push: + branches: + - main + 
+concurrency: + group: ci-build-test-cpp-ubuntu-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + build_and_ctest: + name: Build and Test (ubuntu, ASSERTIONS) + runs-on: ubuntu-22.04 + strategy: + fail-fast: true + env: + CACHE_DIR: ${{ github.workspace }}/.container-cache + CACHE_KEY: ubuntu-build-test-cpp-asserts-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }} + steps: + - name: Set unified TZ + uses: szenius/set-timezone@v2.0 + with: + timezoneLinux: "Asia/Singapore" + timezoneMacos: "Asia/Singapore" + timezoneWindows: "Singapore Standard Time" + + - name: Checking out repository + env: + BRANCH_NAME: ${{ github.ref }} + REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }} + run: | + git init + git remote add origin $REPO_ADDRESS + git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME + git reset --hard FETCH_HEAD + git -c submodule."third_party/torch-mlir".update=none \ + -c submodule."third_party/stablehlo".update=none \ + -c submodule."third_party/XRT".update=none \ + submodule update --init --recursive --depth 1 --single-branch -j 10 + + - name: System deps + run: | + sudo apt install ccache ninja-build + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Python deps + run: | + pip install "numpy<2" pyyaml "pybind11[global]==2.10.3" nanobind + + - name: Enable cache + uses: actions/cache/restore@v3 + with: + path: ${{ env.CACHE_DIR }} + key: ${{ env.CACHE_KEY }} + restore-keys: ubuntu-build-test-cpp-asserts- + + - name: Peano dep + run: | + bash build_tools/download_peano.sh + echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV + + - name: Build packages + run: | + export cache_dir="${{ env.CACHE_DIR }}" + export CCACHE_COMPILERCHECK="string:$(clang --version)" + bash build_tools/build_llvm.sh + rm -rf llvm-build + export llvm_install_dir=$PWD/llvm-install + bash build_tools/build_test_cpp.sh + + - name: Create artifacts + if: ${{ !cancelled() }} + run: | + pushd 
third_party/iree/third_party/llvm-project && llvm_sha_short=$(git rev-parse --short HEAD) && popd + tar cf llvm-dist-ubuntu-$llvm_sha_short.tar llvm-install + tar cf iree-dist-ubuntu.tar iree-install + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + if: ${{ !cancelled() }} + with: + name: ubuntu_x86_64_llvm_packages + path: llvm-dist-*.tar + if-no-files-found: warn + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + if: ${{ !cancelled() }} + with: + name: ubuntu_x86_64_iree_packages + path: iree-dist-ubuntu.tar + if-no-files-found: warn + + - name: Save cache + uses: actions/cache/save@v3 + if: ${{ !cancelled() && github.event_name == 'push' && github.ref_name == 'main' }} + with: + path: ${{ env.CACHE_DIR }} + key: ${{ env.CACHE_KEY }} + + - name: Start tmate session + if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }} + uses: mxschmitt/action-tmate@v3.18 + with: + limit-access-to-actor: true + + test_linux: + name: E2E Test linux + needs: build_and_ctest + strategy: + fail-fast: false + matrix: + runs-on: [linux-phoenix] + runs-on: ${{ matrix.runs-on }} + env: + XILINXD_LICENSE_FILE: /opt/xilinx/Xilinx.lic + steps: + - name: "Checking out repository" # for test scripts + uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + with: + submodules: false # not required for testbench + + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: ubuntu_x86_64_iree_packages + + - name: Extract artifact + run: | + tar -xvf iree-dist-ubuntu.tar + echo "IREE_INSTALL_DIR=$PWD/iree-install" >> $GITHUB_ENV + echo "PYTHONPATH=$PWD/iree-install/python_packages/iree_compiler:$PWD/iree-install/python_packages/iree_runtime" >> $GITHUB_ENV + bash build_tools/download_peano.sh + echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Create venv and install dependencies + run: | + python -m venv .venv + 
source .venv/bin/activate + pip install -r tests/requirements.txt + + - name: Query device info + run: | + source .venv/bin/activate + echo "aie-metadata" + python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-metadata + echo "aie-version" + python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-version + echo "XRT_LITE_N_CORE_ROWS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-rows)" >> $GITHUB_ENV + echo "XRT_LITE_N_CORE_COLS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-cols)" >> $GITHUB_ENV + + - name : E2E comparison of AIE to llvm-cpu + run: | + source .venv/bin/activate + python build_tools/ci/cpu_comparison/run.py \ + test_aie_vs_cpu \ + $PWD/iree-install \ + $PWD/llvm-aie \ + --vitis-dir /opt/Xilinx/Vitis/2024.2 \ + --reset-npu-between-runs -v \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS + + - name: E2E correctness matmul test + run: | + # https://stackoverflow.com/a/17567422 + # shim_xdna::bo::map_drm_bo does an mmap with MAP_LOCKED + # which can fail if limit is to low + sudo prlimit -lunlimited --pid $$ + source .venv/bin/activate + bash build_tools/ci/run_matmul_test.sh \ + test_matmuls \ + iree-install \ + $PWD/llvm-aie \ + /opt/Xilinx/Vitis/2024.2 + + - name: Python tests + run: | + source .venv/bin/activate + pytest -v tests \ + --capture=tee-sys \ + --iree-install-dir=$PWD/iree-install \ + --peano-install-dir=$PWD/llvm-aie \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS + + - name: XRT-LITE tests + run: | + DEVICE_TEST_DIR="$PWD/iree-install/device_tests" + for t in $(ls $DEVICE_TEST_DIR); do + $DEVICE_TEST_DIR/$t --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS + done diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 4642a3856..cdc8705db 100644 --- a/.github/workflows/ci-linux.yml +++ 
b/.github/workflows/ci-linux.yml @@ -53,12 +53,6 @@ jobs: -c submodule."third_party/stablehlo".update=none \ -c submodule."third_party/XRT".update=none \ submodule update --init --recursive --depth 1 --single-branch -j 10 - - - name: Install deps - run: | - dnf install -y almalinux-release-devel epel-release - yum remove -y openssl-devel zlib-devel || true - yum install -y protobuf-devel protobuf-compiler tmate - name: Python deps run: | diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml index 9d273d462..4871e9745 100644 --- a/.github/workflows/ci-macos.yml +++ b/.github/workflows/ci-macos.yml @@ -62,7 +62,10 @@ jobs: git remote add origin $REPO_ADDRESS git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME git reset --hard FETCH_HEAD - git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10 + git -c submodule."third_party/torch-mlir".update=none \ + -c submodule."third_party/stablehlo".update=none \ + -c submodule."third_party/XRT".update=none \ + submodule update --init --recursive --depth 1 --single-branch -j 10 - uses: actions/setup-python@v4 with: diff --git a/README.md b/README.md index d42c62e19..03289fdef 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,6 @@ This repository contains an early-phase IREE compiler and runtime plugin for interfacing the AMD AIE accelerator to IREE. -## Architectural Overview - -![image](https://github.com/nod-ai/iree-amd-aie/assets/74956/3fa73139-5fdf-4658-86c3-0705352c4ea0) - - ## Developer Setup **Strong recommendation**: check the CI scripts @ [.github/workflows](.github/workflows) - they do a fresh checkout and build on every commit and are written to be read by a non-CI expert. 
@@ -33,7 +28,7 @@ or if you want a faster checkout git \ -c submodule."third_party/torch-mlir".update=none \ -c submodule."third_party/stablehlo".update=none \ - -c submodule."src/runtime_src/core/common/aiebu".update=none \ + -c submodule."third_party/XRT".update=none \ clone \ --recursive \ --shallow-submodules \ @@ -46,12 +41,10 @@ The above avoids cloning entire repo histories, and skips unused nested submodul ### Just show me the CMake -To configure and build with XRT runtime enabled - ``` cd iree-amd-aie cmake \ - -B $WHERE_YOU_WOULD_LIKE_TO_BUILD \ + -B \ -S third_party/iree \ -DIREE_CMAKE_PLUGIN_PATHS=$PWD \ -DIREE_BUILD_PYTHON_BINDINGS=ON \ @@ -62,9 +55,9 @@ cmake \ -DIREE_TARGET_BACKEND_DEFAULTS=OFF \ -DIREE_TARGET_BACKEND_LLVM_CPU=ON \ -DIREE_BUILD_TESTS=ON \ - -DIREE_EXTERNAL_HAL_DRIVERS=xrt \ - -DCMAKE_INSTALL_PREFIX=$WHERE_YOU_WOULD_LIKE_TO_INSTALL -cmake --build $WHERE_YOU_WOULD_LIKE_TO_BUILD + -DIREE_EXTERNAL_HAL_DRIVERS=xrt-lite \ + -DCMAKE_INSTALL_PREFIX= +cmake --build ``` ### Instructions @@ -73,9 +66,9 @@ The bare minimum configure command for IREE with the amd-aie plugin ``` cmake \ - -B $WHERE_YOU_WOULD_LIKE_TO_BUILD \ - -S $IREE_REPO_SRC_DIR \ - -DIREE_CMAKE_PLUGIN_PATHS=$IREE_AMD_AIE_REPO_SRC_DIR \ + -B \ + -S \ + -DIREE_CMAKE_PLUGIN_PATHS= \ -DIREE_BUILD_PYTHON_BINDINGS=ON ``` @@ -111,7 +104,7 @@ If you're "bringing your own LLVM", i.e., you have a prebuilt/compiled distribut -DIREE_BUILD_BUNDLED_LLVM=OFF ``` -In this case you will need to supply `-DLLVM_EXTERNAL_LIT=$SOMEWHERE` (e.g., `pip install lit; SOMEWHERE=$(which lit)`). +In this case you will need to supply `-DLLVM_EXTERNAL_LIT=` (e.g., `pip install lit; SOMEWHERE=$(which lit)`). Note, getting the right/matching build of LLVM, that works with IREE is tough (besides the commit hash, there are various flags to set). To enable adventurous users to avail themselves of `-DIREE_BUILD_BUNDLED_LLVM=OFF` we cache/store/save the LLVM distribution for every successful CI run. 
@@ -121,65 +114,18 @@ These can then be downloaded by checking the artifacts section of any recent CI

+## Testing + Lit tests specific to AIE can be run with something like ``` -cd $WHERE_YOU_WOULD_LIKE_TO_BUILD +cd ctest -R amd-aie ``` -Other tests which run on hardware and requiring XRT are in the `build_tools` subdirectory. - -## Runtime driver setup +Other tests, which run on device, are in the `build_tools` subdirectory. -To enable the runtime driver, you need to also enable the XRT HAL +## Architectural overview (out of date) -``` - -DIREE_EXTERNAL_HAL_DRIVERS=xrt -``` - -Additional IREE-specific flags are explained at [IREE's build instructions](https://iree.dev/building-from-source/getting-started/#quickstart-clone-and-build). To use Ninja instead of Make, and clang++ instead of g++, you can add - - -``` - -G Ninja \ - -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_C_COMPILER=clang -``` - - -### Ubuntu Dependencies - -XRT requires a number of packages. Here are the requirements for various operating systems - -``` -apt install \ - libcurl4-openssl-dev \ - libdrm-dev \ - libelf-dev \ - libprotobuf-dev \ - libudev-dev \ - pkg-config \ - protobuf-compiler \ - python3-pybind11 \ - systemtap-sdt-dev \ - uuid-dev -``` - -### RH Based Deps - -This is an incomplete list derived by adding what is needed to our development base manylinux (AlmaLinux 8) image. 
+![image](https://github.com/nod-ai/iree-amd-aie/assets/74956/3fa73139-5fdf-4658-86c3-0705352c4ea0) -``` -yum install \ - libcurl-devel \ - libdrm-devel \ - libudev-devel \ - libuuid-devel \ - ncurses-devel \ - pkgconfig \ - protobuf-compiler \ - protobuf-devel \ - systemtap-sdt-devel \ - uuid-devel -``` diff --git a/iree_compiler_plugin.cmake b/iree_compiler_plugin.cmake index a707091ca..958d6de46 100644 --- a/iree_compiler_plugin.cmake +++ b/iree_compiler_plugin.cmake @@ -17,12 +17,6 @@ if("xrt" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) set(IREE_AMD_AIE_ENABLE_XRT_DRIVER ON) endif() -set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER OFF) -if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) - message(STATUS "Enabling XRT-LITE build because it is an enabled HAL driver") - set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER ON) -endif() - if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) include(iree_aie_xrt) endif() diff --git a/iree_runtime_plugin.cmake b/iree_runtime_plugin.cmake index 0bc5637b5..d8138465e 100644 --- a/iree_runtime_plugin.cmake +++ b/iree_runtime_plugin.cmake @@ -21,19 +21,10 @@ if("xrt" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) set(IREE_AMD_AIE_ENABLE_XRT_DRIVER ON) endif() -set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER OFF) -if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) - message(STATUS "Enabling XRT-LITE build because it is an enabled HAL driver") - set(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER ON) -endif() - if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) include(iree_aie_xrt) endif() - -if(IREE_AMD_AIE_ENABLE_XRT_DRIVER OR IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) - include(iree_aie_bootgen) -endif() +include(iree_aie_bootgen) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/runtime/src AMD-AIE) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/experimental AMD-AIE-experimental) diff --git a/runtime/src/iree-amd-aie/CMakeLists.txt b/runtime/src/iree-amd-aie/CMakeLists.txt index d861c846d..8b67676cd 100644 --- a/runtime/src/iree-amd-aie/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/CMakeLists.txt @@ -5,17 +5,17 @@ # 
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) - add_subdirectory(driver/xrt) + add_subdirectory(driver/xrt) endif() -if(IREE_AMD_AIE_ENABLE_XRT_LITE_DRIVER) - add_subdirectory(driver/xrt-lite) +if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) + add_subdirectory(driver/xrt-lite) endif() -# Flatbuffer schema generation does not require XRT. Moreover the generated +# Flatbuffer schema generation does not require a driver but the generated # flatbuffer header files are used by the compiler to create artefacts # (.vmfb file), and so the schema sub-directory is required even when not -# building the XRT driver code. +# building driver code. add_subdirectory(schemas) # Contains libiree_aie_runtime, i.e., suitably encapsulated calls to aie-rt. From db3cc6b3824cf8209532a4f8fcc95d42d4012815 Mon Sep 17 00:00:00 2001 From: makslevental Date: Fri, 18 Oct 2024 13:00:34 -0400 Subject: [PATCH 31/35] add iree-benchmark-module test --- README.md | 64 +++++++++++++++---- build_tools/ci/run_matmul_test.sh | 16 ++++- .../iree-amd-aie/driver/xrt-lite/driver.cc | 1 + 3 files changed, 65 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 03289fdef..18c7cbe92 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # AMD AIE Plugin for IREE -This repository contains an early-phase IREE compiler and runtime plugin for interfacing the AMD AIE accelerator to IREE. +This repository contains an early-phase IREE compiler and runtime plugin for targeting AMD NPUs with IREE. 
## Developer Setup @@ -21,8 +21,7 @@ git clone --recursive git@github.com:nod-ai/iree-amd-aie.git git clone --recursive https://github.com/nod-ai/iree-amd-aie.git ``` -or if you want a faster checkout - +or, if you want a faster checkout, ``` git \ @@ -32,10 +31,11 @@ git \ clone \ --recursive \ --shallow-submodules \ - https://github.com/nod-ai/iree-amd-aie.git + git@github.com:nod-ai/iree-amd-aie.git # https://github.com/nod-ai/iree-amd-aie.git ``` -The above avoids cloning entire repo histories, and skips unused nested submodules. +The above avoids cloning entire repo histories for submodules, and skips a few, currently, unused, +submodules that are nested in IREE. ## Building (along with IREE) @@ -62,7 +62,7 @@ cmake --build ### Instructions -The bare minimum configure command for IREE with the amd-aie plugin +The bare minimum configure command for IREE with the amd-aie plugin ``` cmake \ @@ -81,7 +81,8 @@ Very likely, you will want to use `ccache` and `lld` (or some other modern linke -DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=lld" ``` -If you don't plan on using any of IREE's frontends or backends/targets (e.g., you're doing work on this code base itself), you can opt-out of everything (except the `llvm-cpu` backend) with +If you don't plan on using any of IREE's frontends or backends/targets (e.g., you're doing work on this code base itself), +you can opt-out of everything (except the `llvm-cpu` backend) with ``` -DIREE_INPUT_STABLEHLO=OFF \ @@ -104,9 +105,29 @@ If you're "bringing your own LLVM", i.e., you have a prebuilt/compiled distribut -DIREE_BUILD_BUNDLED_LLVM=OFF ``` -In this case you will need to supply `-DLLVM_EXTERNAL_LIT=` (e.g., `pip install lit; SOMEWHERE=$(which lit)`). +In this case you will need `lit` somewhere in your environment and you will need to add to CMake `-DLLVM_EXTERNAL_LIT=` +(e.g., `pip install lit; SOMEWHERE=$(which lit)`). 
+ +See [Bringing your own LLVM](#bringing-your-own-llvm) below for more information on using prebuilt/compiled distributions of LLVM. + +## Testing + +Lit tests (i.e., compiler tests) specific to AIE can be run with something like + +``` +cd +ctest -R amd-aie --output-on-failure -j 10 +``` + +(the `-j 10` runs `10` tests in parallel) + +Other tests, which run on device, are in the `build_tools` subdirectory. -Note, getting the right/matching build of LLVM, that works with IREE is tough (besides the commit hash, there are various flags to set). +## Pro-tips + +### Bringing your own LLVM + +When using a pre-built distribution of LLVM, getting the right/matching build, that works with IREE, is tough (besides the commit hash, there are various flags to set). To enable adventurous users to avail themselves of `-DIREE_BUILD_BUNDLED_LLVM=OFF` we cache/store/save the LLVM distribution for every successful CI run. These can then be downloaded by checking the artifacts section of any recent CI run's [Summary page](https://github.com/nod-ai/iree-amd-aie/actions/runs/10713474448): @@ -114,16 +135,31 @@ These can then be downloaded by checking the artifacts section of any recent CI

-## Testing -Lit tests specific to AIE can be run with something like +### Debugging HAL + +You can turn on HAL API tracing by adding to CMake: ``` -cd -ctest -R amd-aie +-DIREE_ENABLE_RUNTIME_TRACING=ON +-DIREE_TRACING_PROVIDER=console +// optional but recommended +-DIREE_TRACING_CONSOLE_FLUSH=1 ``` -Other tests, which run on device, are in the `build_tools` subdirectory. +This will show you all the HAL APIs that have `IREE_TRACE_ZONE_BEGIN ... IREE_TRACE_ZONE_END` that are hit during a run/execution (of, e.g., `iree-run-module`). + +You can turn on VM tracing by adding to CMake: + +``` +-DIREE_VM_EXECUTION_TRACING_ENABLE=1 +-DIREE_VM_EXECUTION_TRACING_FORCE_ENABLE=1 +// optional +-DIREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE=1 +``` + +This will show you all of the [VM dispatches](https://github.com/iree-org/iree/blob/0e8a5737dfe49a48a4e9c15ba7a7d24dd2fd7623/runtime/src/iree/vm/bytecode/dispatch.c#L661) that actually occur during a run/execution. +Note, this is roughly equivalent to [passing](https://github.com/nod-ai/iree-amd-aie/blob/737092791dc2428ad71bc172f69804c583b0f60e/build_tools/ci/run_matmul_test.sh#L420) `--compile-to=vm` to `iree-compile`.
## Architectural overview (out of date) diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index f4954a0d5..1ed121c44 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -441,7 +441,7 @@ function run_matmul_test() { fi fi - # Renable exit on failure: + # Re-enable exit on failure: echo "**** Generating calls .vmfb file for ${name} ****" ${IREE_COMPILE_EXE} "${calls_ir}" \ --iree-hal-target-backends=${target_backend} \ @@ -793,7 +793,19 @@ if [ -d "$VITIS" ]; then fi -echo "\n\n" +# note this will not actually show any devices because --xrt_lite_n_core_rows --xrt_lite_n_core_cols are not passed +# which i have omitted to make the conditional slightly more succinct +if [[ $($IREE_INSTALL_DIR/bin/iree-benchmark-module --dump_devices | grep xrt-lite) ]]; then + $IREE_INSTALL_DIR/bin/iree-benchmark-module \ + --module=$OUTPUT_DIR/mm_test1_bf16_f32_m64_n64_k64.vmfb \ + --function=matmul_64x64_64xbf16_ \ + --input=64x64xbf16 \ + --input=64x64xbf16 \ + --device=xrt-lite \ + --benchmark_repetitions=10 \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS +fi echo "$MATMUL_TESTS_RUN matmul tests run!" 
if [ $MATMUL_TESTS_FAILS -ne 0 ]; then diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc index 87a7b9c1f..3dbba529f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -127,6 +127,7 @@ namespace { const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable = { .destroy = iree_hal_xrt_lite_driver_destroy, .query_available_devices = iree_hal_xrt_lite_driver_query_available_devices, + .dump_device_info = unimplemented_ok_status, .create_device_by_id = iree_hal_xrt_lite_driver_create_device_by_id, .create_device_by_path = iree_hal_xrt_lite_driver_create_device_by_path, }; From 8974d20ba809b5bc9d0245c8f1d602438a88fb93 Mon Sep 17 00:00:00 2001 From: makslevental Date: Fri, 18 Oct 2024 15:49:57 -0400 Subject: [PATCH 32/35] remove WERROR hack and add `run_all_runtime_tests.sh` --- .github/workflows/ci-linux.yml | 5 --- README.md | 1 + build_tools/build_test_cpp.ps1 | 2 - build_tools/build_test_cpp.sh | 2 - build_tools/ci/run_all_runtime_tests.sh | 52 +++++++++++++++++++++++++ 5 files changed, 53 insertions(+), 9 deletions(-) create mode 100755 build_tools/ci/run_all_runtime_tests.sh diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index cdc8705db..756d622db 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -59,11 +59,6 @@ jobs: pip install -r third_party/iree/runtime/bindings/python/iree/runtime/build_requirements.txt pip install pyyaml - - name: Peano dep - run: | - bash build_tools/download_peano.sh - echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV - - name: Enable cache uses: actions/cache/restore@v3 with: diff --git a/README.md b/README.md index 18c7cbe92..c6bbe5c1f 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ ctest -R amd-aie --output-on-failure -j 10 (the `-j 10` runs `10` tests in parallel) Other tests, which run on device, are in the 
`build_tools` subdirectory. +See [build_tools/ci/run_all_runtime_tests.sh](build_tools/ci/run_all_runtime_tests.sh) for an example script that shows how to run all the runtime tests. ## Pro-tips diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1 index 4a579cbd6..e686f40cf 100644 --- a/build_tools/build_test_cpp.ps1 +++ b/build_tools/build_test_cpp.ps1 @@ -82,8 +82,6 @@ $CMAKE_ARGS = @( "-DIREE_CMAKE_PLUGIN_PATHS=$repo_root" "-DIREE_EXTERNAL_HAL_DRIVERS=xrt" "-DIREE_BUILD_PYTHON_BINDINGS=ON" - # iree/runtime/src/iree/hal/cts/cts_test_base.h:173:24: error: unused variable 'device_buffer' [-Werror,-Wunused-variable] - "-DIREE_ENABLE_WERROR_FLAG=OFF" ) $peano_install_dir = "$env:PEANO_INSTALL_DIR" diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index e45e3cdf4..a1d610f64 100644 --- a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -81,8 +81,6 @@ CMAKE_ARGS=( -DIREE_INPUT_TORCH=OFF -DCMAKE_OBJECT_PATH_MAX=4096 -DIREE_CMAKE_PLUGIN_PATHS="$repo_root" - # iree/runtime/src/iree/hal/cts/cts_test_base.h:173:24: error: unused variable 'device_buffer' [-Werror,-Wunused-variable] - -DIREE_ENABLE_WERROR_FLAG=OFF ) PEANO_INSTALL_DIR=${PEANO_INSTALL_DIR:-""} diff --git a/build_tools/ci/run_all_runtime_tests.sh b/build_tools/ci/run_all_runtime_tests.sh new file mode 100755 index 000000000..1d439b93f --- /dev/null +++ b/build_tools/ci/run_all_runtime_tests.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -eu + +this_dir="$(cd $(dirname $0) && pwd)" +src_dir="$(cd $this_dir/../.. 
&& pwd)" + +if [ -z "${IREE_INSTALL_DIR}" ]; then + echo "IREE_INSTALL_DIR needs to be set" + exit 1 +fi + +if [ -z "${PEANO_INSTALL_DIR}" ]; then + echo "PEANO_INSTALL_DIR needs to be set" + exit 1 +fi + +if [ -z "${VITIS_DIR}" ]; then + echo "VITIS_DIR needs to be set" + exit 1 +fi + +if [ -z "${XILINXD_LICENSE_FILE}" ]; then + echo "XILINXD_LICENSE_FILE needs to be set" + exit 1 +fi + +export PYTHONPATH=$IREE_INSTALL_DIR/python_packages/iree_compiler:$IREE_INSTALL_DIR/python_packages/iree_runtime +export XRT_LITE_N_CORE_ROWS=$(python $this_dir/amdxdna_driver_utils/amdxdna_ioctl.py --num-rows) +export XRT_LITE_N_CORE_COLS=$(python $this_dir/amdxdna_driver_utils/amdxdna_ioctl.py --num-cols) +export PATH=$IREE_INSTALL_DIR/bin:$PATH + +$this_dir/cpu_comparison/run.py \ + $this_dir/test_aie_vs_cpu \ + $IREE_INSTALL_DIR \ + $PEANO_INSTALL_DIR \ + --vitis-dir $VITIS_DIR \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS \ + -v + +$this_dir/run_matmul_test.sh \ + $this_dir/test_matmuls \ + $IREE_INSTALL_DIR \ + $PEANO_INSTALL_DIR \ + $VITIS_DIR + +pytest -rv --capture=tee-sys $src_dir/tests \ + --peano-install-dir=$PEANO_INSTALL_DIR \ + --iree-install-dir=$IREE_INSTALL_DIR \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS From 05ddaec77557c1ce2f984b53e42627bd6880858c Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 18 Oct 2024 16:07:17 -0700 Subject: [PATCH 33/35] Delete .github/workflows/ci-linux-cleanroom.yml --- .github/workflows/ci-linux-cleanroom.yml | 216 ----------------------- 1 file changed, 216 deletions(-) delete mode 100644 .github/workflows/ci-linux-cleanroom.yml diff --git a/.github/workflows/ci-linux-cleanroom.yml b/.github/workflows/ci-linux-cleanroom.yml deleted file mode 100644 index f7a46a95d..000000000 --- a/.github/workflows/ci-linux-cleanroom.yml +++ /dev/null @@ -1,216 +0,0 @@ -name: CI Ubuntu - -on: - workflow_call: - workflow_dispatch: - 
inputs: - force_debug_with_tmate: - type: boolean - description: 'Run the build with tmate session' - required: false - default: false - debug_with_tmate: - type: boolean - description: 'Run the build with a tmate session ONLY in case of failure' - required: false - default: false - pull_request: - merge_group: - push: - branches: - - main - -concurrency: - group: ci-build-test-cpp-ubuntu-${{ github.event.number || github.sha }} - cancel-in-progress: true - -jobs: - build_and_ctest: - name: Build and Test (ubuntu, ASSERTIONS) - runs-on: ubuntu-22.04 - strategy: - fail-fast: true - env: - CACHE_DIR: ${{ github.workspace }}/.container-cache - CACHE_KEY: ubuntu-build-test-cpp-asserts-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }} - steps: - - name: Set unified TZ - uses: szenius/set-timezone@v2.0 - with: - timezoneLinux: "Asia/Singapore" - timezoneMacos: "Asia/Singapore" - timezoneWindows: "Singapore Standard Time" - - - name: Checking out repository - env: - BRANCH_NAME: ${{ github.ref }} - REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }} - run: | - git init - git remote add origin $REPO_ADDRESS - git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME - git reset --hard FETCH_HEAD - git -c submodule."third_party/torch-mlir".update=none \ - -c submodule."third_party/stablehlo".update=none \ - -c submodule."third_party/XRT".update=none \ - submodule update --init --recursive --depth 1 --single-branch -j 10 - - - name: System deps - run: | - sudo apt install ccache ninja-build - - - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Python deps - run: | - pip install "numpy<2" pyyaml "pybind11[global]==2.10.3" nanobind - - - name: Enable cache - uses: actions/cache/restore@v3 - with: - path: ${{ env.CACHE_DIR }} - key: ${{ env.CACHE_KEY }} - restore-keys: ubuntu-build-test-cpp-asserts- - - - name: Peano dep - run: | - bash build_tools/download_peano.sh - echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV 
- - - name: Build packages - run: | - export cache_dir="${{ env.CACHE_DIR }}" - export CCACHE_COMPILERCHECK="string:$(clang --version)" - bash build_tools/build_llvm.sh - rm -rf llvm-build - export llvm_install_dir=$PWD/llvm-install - bash build_tools/build_test_cpp.sh - - - name: Create artifacts - if: ${{ !cancelled() }} - run: | - pushd third_party/iree/third_party/llvm-project && llvm_sha_short=$(git rev-parse --short HEAD) && popd - tar cf llvm-dist-ubuntu-$llvm_sha_short.tar llvm-install - tar cf iree-dist-ubuntu.tar iree-install - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - if: ${{ !cancelled() }} - with: - name: ubuntu_x86_64_llvm_packages - path: llvm-dist-*.tar - if-no-files-found: warn - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - if: ${{ !cancelled() }} - with: - name: ubuntu_x86_64_iree_packages - path: iree-dist-ubuntu.tar - if-no-files-found: warn - - - name: Save cache - uses: actions/cache/save@v3 - if: ${{ !cancelled() && github.event_name == 'push' && github.ref_name == 'main' }} - with: - path: ${{ env.CACHE_DIR }} - key: ${{ env.CACHE_KEY }} - - - name: Start tmate session - if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }} - uses: mxschmitt/action-tmate@v3.18 - with: - limit-access-to-actor: true - - test_linux: - name: E2E Test linux - needs: build_and_ctest - strategy: - fail-fast: false - matrix: - runs-on: [linux-phoenix] - runs-on: ${{ matrix.runs-on }} - env: - XILINXD_LICENSE_FILE: /opt/xilinx/Xilinx.lic - steps: - - name: "Checking out repository" # for test scripts - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 - with: - submodules: false # not required for testbench - - - name: Download artifacts - uses: actions/download-artifact@v4 - with: - name: ubuntu_x86_64_iree_packages - - - name: Extract artifact - run: | - tar -xvf iree-dist-ubuntu.tar - echo "IREE_INSTALL_DIR=$PWD/iree-install" >> $GITHUB_ENV - echo 
"PYTHONPATH=$PWD/iree-install/python_packages/iree_compiler:$PWD/iree-install/python_packages/iree_runtime" >> $GITHUB_ENV - bash build_tools/download_peano.sh - echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV - - - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Create venv and install dependencies - run: | - python -m venv .venv - source .venv/bin/activate - pip install -r tests/requirements.txt - - - name: Query device info - run: | - source .venv/bin/activate - echo "aie-metadata" - python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-metadata - echo "aie-version" - python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-version - echo "XRT_LITE_N_CORE_ROWS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-rows)" >> $GITHUB_ENV - echo "XRT_LITE_N_CORE_COLS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-cols)" >> $GITHUB_ENV - - - name : E2E comparison of AIE to llvm-cpu - run: | - source .venv/bin/activate - python build_tools/ci/cpu_comparison/run.py \ - test_aie_vs_cpu \ - $PWD/iree-install \ - $PWD/llvm-aie \ - --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v \ - --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ - --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS - - - name: E2E correctness matmul test - run: | - # https://stackoverflow.com/a/17567422 - # shim_xdna::bo::map_drm_bo does an mmap with MAP_LOCKED - # which can fail if limit is to low - sudo prlimit -lunlimited --pid $$ - source .venv/bin/activate - bash build_tools/ci/run_matmul_test.sh \ - test_matmuls \ - iree-install \ - $PWD/llvm-aie \ - /opt/Xilinx/Vitis/2024.2 - - - name: Python tests - run: | - source .venv/bin/activate - pytest -v tests \ - --capture=tee-sys \ - --iree-install-dir=$PWD/iree-install \ - --peano-install-dir=$PWD/llvm-aie \ - --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ - --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS - - - name: XRT-LITE tests - run: | - 
DEVICE_TEST_DIR="$PWD/iree-install/device_tests" - for t in $(ls $DEVICE_TEST_DIR); do - $DEVICE_TEST_DIR/$t --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS - done From 66db9ce675f262d055c109dcd5b6770992572677 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 18 Oct 2024 16:12:29 -0700 Subject: [PATCH 34/35] Update compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h --- compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h index 91007c988..352c86a03 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h @@ -55,7 +55,7 @@ struct AMDAIEOptions { bool enablePacketFlow{false}; enum class DeviceHAL { XRT, XRT_LITE }; - DeviceHAL deviceHal{DeviceHAL::XRT}; + DeviceHAL deviceHal{DeviceHAL::XRT_LITE}; void bindOptions(OptionsBinder &binder) { static llvm::cl::OptionCategory category("AMD AIE Options"); From 93a7ba5f20c7a8b9b4a3de11f10de1aba512c539 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 18 Oct 2024 16:18:27 -0700 Subject: [PATCH 35/35] Apply suggestions from code review --- .github/CODEOWNERS | 3 +-- .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 16 +++++++++++++--- .../Target/test/amd_aie_target_backend.mlir | 4 ++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 61e98b3de..138afc058 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,8 +15,7 @@ /compiler/ @MaheshRavishankar @nirvedhmeshram @yzhang93 @Abhishek-Varma @jtuyls # Runtime -/runtime/ @nirvedhmeshram -/runtime/src/iree-amd-aie/aie_runtime @makslevental +/runtime/ @makslevental # AIE Passes /compiler/plugins/target/AMD-AIE/aie @makslevental diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index acb0de8f5..d1c43f1ce 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -397,18 +397,28 @@ LogicalResult AIETargetBackend::serializeExecutable( if (auto err = llvm::sys::fs::create_directories(entryPointWorkDir)) { return moduleOp.emitOpError() - << "failed to create working directory for pdi generation: " + << "failed to create working directory for artifact generation: " << err.message(); } llvm::outs().flush(); SmallString<128> artifactPath(entryPointWorkDir); - llvm::sys::path::append(artifactPath, entryPointNamesFb[ordinal] + ".pdi"); + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + llvm::sys::path::append(artifactPath, entryPointNamesFb[ordinal] + ".xclbin"); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + llvm::sys::path::append(artifactPath, entryPointNamesFb[ordinal] + ".pdi"); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } SmallString<128> npuInstPath(entryPointWorkDir); llvm::sys::path::append(npuInstPath, entryPointNamesFb[ordinal] + ".npu.txt"); - // Convert ordinal to hexadecimal string for pdi kernel id. + // Convert ordinal to hexadecimal string for kernel id. 
std::stringstream ordinalHex; ordinalHex << "0x" << std::hex << ordinal; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir index 3b1e4a2a8..872ad76cd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir @@ -1,8 +1,8 @@ // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets %s | FileCheck %s --check-prefix=DEFAULT // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-enable-ukernels=all %s | FileCheck %s --check-prefix=ENABLE_UKERNEL -// DEFAULT: hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>) { -// ENABLE_UKERNEL: hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "all"}>) { +// DEFAULT: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "none"}>) { +// ENABLE_UKERNEL: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "all"}>) { func.func @matmul_small(%lhs : tensor<8x16xi32>, %rhs : tensor<16x32xi32>) -> tensor<8x32xi32> { %empty = tensor.empty() : tensor<8x32xi32>