From 996a7bbaaf371ff1947625a2eb4f6b8e590a6f4b Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 10 Oct 2024 23:17:53 -0400 Subject: [PATCH] command buffer --- .../iree-amd-aie/driver/xrt-lite/allocator.cc | 29 ++-- .../iree-amd-aie/driver/xrt-lite/buffer.cc | 160 +++++++++--------- .../src/iree-amd-aie/driver/xrt-lite/buffer.h | 26 ++- .../driver/xrt-lite/command_buffer.cc | 13 +- .../driver/xrt-lite/executable.cc | 23 +-- .../iree-amd-aie/driver/xrt-lite/executable.h | 3 +- .../driver/xrt-lite/nop_executable_cache.h | 2 +- .../driver/xrt-lite/shim/linux/kmq/device.cpp | 3 +- .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 4 +- 9 files changed, 141 insertions(+), 122 deletions(-) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc index 25e45939c..2211d2103 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -125,26 +125,9 @@ struct iree_hal_xrt_lite_allocator { this->query_buffer_compatibility(&compat_params, &allocation_size); if (!iree_all_bits_set(compatibility, IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { - // TODO(benvanik): make a helper for this. -#if IREE_STATUS_MODE - iree_bitfield_string_temp_t temp0, temp1, temp2; - iree_string_view_t memory_type_str = - iree_hal_memory_type_format(params->type, &temp0); - iree_string_view_t usage_str = - iree_hal_buffer_usage_format(params->usage, &temp1); - iree_string_view_t compatibility_str = - iree_hal_buffer_compatibility_format(compatibility, &temp2); - return iree_make_status( - IREE_STATUS_INVALID_ARGUMENT, - "allocator cannot allocate a buffer with the given parameters; " - "memory_type=%.*s, usage=%.*s, compatibility=%.*s", - (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, - usage_str.data, (int)compatibility_str.size, compatibility_str.data); -#else return iree_make_status( IREE_STATUS_INVALID_ARGUMENT, "allocator cannot allocate a buffer with the given parameters"); -#endif // IREE_STATUS_MODE } // TODO(null): allocate the underlying device memory. @@ -156,10 +139,18 @@ struct iree_hal_xrt_lite_allocator { // just wrapping those device pointers in the HAL buffer type. Other // implementations that require more tracking can provide their own buffer // types that do such tracking for them. - (void)this; + + uint32_t flags = XCL_BO_FLAGS_HOST_ONLY; + // if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { + // flags = XCL_BO_FLAGS_CACHEABLE; + // } else if (iree_all_bits_set(params->type, + // IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE)) { + // // TODO(max): the test here isn't specific enough + // flags = XCL_BO_FLAGS_EXECBUF; + // } std::unique_ptr bo = - shim_device->alloc_bo(allocation_size, XCL_BO_FLAGS_HOST_ONLY); + shim_device->alloc_bo(allocation_size, flags); iree_hal_buffer_t* buffer = nullptr; iree_status_t status = iree_hal_xrt_lite_buffer_wrap( std::move(bo), reinterpret_cast(this), diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc index 0b1c62523..7f92c9811 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -9,92 +9,87 @@ #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" #include "iree-amd-aie/driver/xrt-lite/util.h" -struct iree_hal_xrt_lite_buffer { - iree_hal_buffer_t base; - std::unique_ptr bo; - iree_hal_buffer_release_callback_t release_callback; - - iree_status_t map_range(iree_hal_mapping_mode_t mapping_mode, - iree_hal_memory_access_t memory_access, - iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { - IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( - iree_hal_buffer_memory_type( - reinterpret_cast(this)), - IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); - IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( - iree_hal_buffer_allowed_usage( - reinterpret_cast(this)), - mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT - ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT - : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); - - // TODO(null): perform mapping as described. Note that local-to-buffer range - // adjustment may be required. The resulting mapping is populated with - // standard information such as contents indicating the host addressable - // memory range of the mapped buffer and implementation-specific information - // if additional resources are required. iree_hal_buffer_emulated_map_range - // can be used by implementations that have no way of providing host - // pointers at a large cost (alloc + device->host transfer on map and - // host->device transfer + dealloc on umap). Try not to use that. - void* host_ptr = this->bo->map(); - IREE_ASSERT(host_ptr != - nullptr); // Should be guaranteed by previous checks. - uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; - iree_status_t status = - this->invalidate_range(local_byte_offset, local_byte_length); - // If we mapped for discard scribble over the bytes. This is not a mandated - // behavior but it will make debugging issues easier. Alternatively for heap - // buffers we could reallocate them such that ASAN yells, but that would - // only work if the entire buffer was discarded. +iree_status_t iree_hal_xrt_lite_buffer::map_range( + iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type( + reinterpret_cast(this)), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_IF_ERROR(iree_hal_buffer_validate_usage( + iree_hal_buffer_allowed_usage( + reinterpret_cast(this)), + mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT + ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT + : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + + // TODO(null): perform mapping as described. Note that local-to-buffer range + // adjustment may be required. The resulting mapping is populated with + // standard information such as contents indicating the host addressable + // memory range of the mapped buffer and implementation-specific information + // if additional resources are required. iree_hal_buffer_emulated_map_range + // can be used by implementations that have no way of providing host + // pointers at a large cost (alloc + device->host transfer on map and + // host->device transfer + dealloc on umap). Try not to use that. + void* host_ptr = this->bo->map(); + IREE_ASSERT(host_ptr != nullptr); // Should be guaranteed by previous checks. + uint8_t* data_ptr = (uint8_t*)host_ptr + local_byte_offset; + iree_status_t status = + this->invalidate_range(local_byte_offset, local_byte_length); + // If we mapped for discard scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for heap + // buffers we could reallocate them such that ASAN yells, but that would + // only work if the entire buffer was discarded. #ifndef NDEBUG - if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { - memset(data_ptr, 0xCD, local_byte_length); - } -#endif // !NDEBUG - mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); - return status; + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); } +#endif // !NDEBUG + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + return status; +} - iree_status_t unmap_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { - // TODO(null): reverse of map_range. Note that cache invalidation is - // explicit via invalidate_range and need not be performed here. If using - // emulated mapping this must call iree_hal_buffer_emulated_unmap_range to - // release the transient resources. - return this->flush_range(local_byte_offset, local_byte_length); - } +iree_status_t iree_hal_xrt_lite_buffer::unmap_range( + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + // TODO(null): reverse of map_range. Note that cache invalidation is + // explicit via invalidate_range and need not be performed here. If using + // emulated mapping this must call iree_hal_buffer_emulated_unmap_range to + // release the transient resources. + return this->flush_range(local_byte_offset, local_byte_length); +} - iree_status_t invalidate_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - // TODO(null): invalidate the range if required by the buffer. Writes on the - // device are expected to be visible to the host after this returns. - if (IREE_UNLIKELY(!this->bo)) { - return iree_make_status( - IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have device memory attached and cannot be mapped"); - } - this->bo->sync(shim_xdna::direction::device2host, local_byte_length, - local_byte_offset); - return iree_ok_status(); +iree_status_t iree_hal_xrt_lite_buffer::invalidate_range( + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // TODO(null): invalidate the range if required by the buffer. Writes on the + // device are expected to be visible to the host after this returns. + if (IREE_UNLIKELY(!this->bo)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); } + this->bo->sync(shim_xdna::direction::device2host, local_byte_length, + local_byte_offset); + return iree_ok_status(); +} - iree_status_t flush_range(iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - // TODO(null): flush the range if required by the buffer. Writes on the - // host are expected to be visible to the device after this returns. - if (IREE_UNLIKELY(!this->bo)) { - return iree_make_status( - IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have device memory attached and cannot be mapped"); - } - this->bo->sync(shim_xdna::direction::host2device, local_byte_length, - local_byte_offset); - return iree_ok_status(); +iree_status_t iree_hal_xrt_lite_buffer::flush_range( + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + // TODO(null): flush the range if required by the buffer. Writes on the + // host are expected to be visible to the device after this returns. + if (IREE_UNLIKELY(!this->bo)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); } -}; + this->bo->sync(shim_xdna::direction::host2device, local_byte_length, + local_byte_offset); + return iree_ok_status(); +} namespace { extern const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable; @@ -151,6 +146,13 @@ static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { IREE_TRACE_ZONE_END(z0); } +std::unique_ptr iree_hal_xrt_lite_buffer_unwrap( + iree_hal_buffer_t* base_buffer) { + iree_hal_xrt_lite_buffer* buffer = + reinterpret_cast(base_buffer); + return std::move(buffer->bo); +} + #define BUFFER_MEMBER_STATUS(member) \ MEMBER_WRAPPER_STATUS(iree_hal_buffer_t, iree_hal_xrt_lite_buffer, member) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h index 31849a30d..c89f14164 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h @@ -11,7 +11,28 @@ #include "iree/base/api.h" #include "iree/hal/api.h" -// Wraps an allocation in an iree_hal_buffer_t. +struct iree_hal_xrt_lite_buffer { + iree_hal_buffer_t base; + std::unique_ptr bo; + iree_hal_buffer_release_callback_t release_callback; + + iree_status_t map_range(iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping); + + iree_status_t unmap_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping); + + iree_status_t invalidate_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length); + + iree_status_t flush_range(iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length); +}; + iree_status_t iree_hal_xrt_lite_buffer_wrap( std::unique_ptr bo, iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, @@ -20,4 +41,7 @@ iree_status_t iree_hal_xrt_lite_buffer_wrap( iree_hal_buffer_release_callback_t release_callback, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); +std::unique_ptr iree_hal_xrt_lite_buffer_unwrap( + iree_hal_buffer_t* base_buffer); + #endif // IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc index 59e01fdba..52e124af2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/command_buffer.cc @@ -6,7 +6,11 @@ #include "iree-amd-aie/driver/xrt-lite/command_buffer.h" +#include "buffer.h" #include "iree-amd-aie/driver/xrt-lite/util.h" +#include "shim/linux/kmq/bo.h" + +#define MAX_EXEC_BO_SIZE (4096) namespace { extern const iree_hal_command_buffer_vtable_t @@ -16,6 +20,7 @@ extern const iree_hal_command_buffer_vtable_t struct iree_hal_xrt_lite_command_buffer { iree_hal_command_buffer_t base; iree_allocator_t host_allocator; + iree_hal_buffer_t* exec_buffer; iree_status_t begin() { // TODO(null): if the implementation needs to route the begin to the @@ -252,8 +257,10 @@ iree_status_t iree_hal_xrt_lite_command_buffer_create( // iree_arena_t/block pools. Implementations should also retain any resources // used during the recording and can use iree_hal_resource_set_t* to make that // easier. - iree_status_t status = iree_make_status( - IREE_STATUS_UNIMPLEMENTED, "command buffers not yet implemented"); + iree_hal_buffer_params_t params; + params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE; + iree_status_t status = iree_hal_allocator_allocate_buffer( + device_allocator, params, MAX_EXEC_BO_SIZE, &command_buffer->exec_buffer); if (iree_status_is_ok(status)) { *out_command_buffer = &command_buffer->base; @@ -273,7 +280,7 @@ static void iree_hal_xrt_lite_command_buffer_destroy( // TODO(null): release any implementation resources and // iree_hal_resource_set_t. - + iree_hal_buffer_destroy(command_buffer->exec_buffer); iree_allocator_free(host_allocator, command_buffer); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc index 02180a879..fdb48b0d8 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -14,8 +14,6 @@ #include "iree-amd-aie/schemas/xrt_executable_def_verifier.h" #include "iree/base/api.h" -#define MAX_EXEC_BO_SIZE (4096) - struct iree_hal_xrt_lite_native_executable_t { // Abstract resource used for injecting reference counting and vtable; must be // at offset 0. @@ -33,7 +31,7 @@ extern const iree_hal_executable_vtable_t static iree_hal_xrt_lite_native_executable_t* iree_hal_xrt_lite_native_executable_cast(iree_hal_executable_t* base_value) { IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_xrt_lite_native_executable_vtable); - return (iree_hal_xrt_lite_native_executable_t*)base_value; + return reinterpret_cast(base_value); } // Verifies the structure of the flatbuffer so that we can avoid doing so during @@ -151,12 +149,11 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( entry_point_count * sizeof(executable->entry_points[0]) + total_entry_point_name_chars; IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - iree_allocator_malloc(host_allocator, total_size, (void**)&executable)); - IREE_TRACE( - char* string_table_buffer = - (char*)((char*)executable + sizeof(*executable) + - entry_point_count * sizeof(executable->entry_points[0]))); + z0, iree_allocator_malloc(host_allocator, total_size, + reinterpret_cast(&executable))); + IREE_TRACE(char* string_table_buffer = reinterpret_cast( + reinterpret_cast(executable) + sizeof(*executable) + + entry_point_count * sizeof(executable->entry_points[0]))); iree_hal_resource_initialize(&iree_hal_xrt_lite_native_executable_vtable, &executable->resource); @@ -180,8 +177,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); xrt::xclbin xclbin = xrt::xclbin(xclbinVector); - std::unique_ptr hw_ctx = - shim_device->create_hw_context(xclbin); + params->context = shim_device->create_hw_context(xclbin); uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); @@ -194,9 +190,6 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( size_t ctrl_code_size = num_instr * sizeof(uint32_t); params->bo_ctrl_code = shim_device->alloc_bo(ctrl_code_size, XCL_BO_FLAGS_CACHEABLE); - params->bo_exec_buf = - shim_device->alloc_bo(MAX_EXEC_BO_SIZE, XCL_BO_FLAGS_EXECBUF); - uint32_t* instr_buffer = static_cast(params->bo_ctrl_code->map()); memcpy(instr_buffer, asm_inst, ctrl_code_size); @@ -231,7 +224,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_create( }); } - *out_executable = (iree_hal_executable_t*)executable; + *out_executable = reinterpret_cast(executable); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h index 7310a103b..b70c266cc 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -24,7 +24,6 @@ extern "C" { struct iree_hal_xrt_lite_kernel_params_t { std::unique_ptr context; std::unique_ptr bo_ctrl_code; - std::unique_ptr bo_exec_buf; // Number of assembly instructions argument to the kernel uint32_t num_instr; // number of instructions IREE_TRACE(iree_string_view_t kernel_name;) @@ -45,7 +44,7 @@ iree_status_t iree_hal_xrt_lite_native_executable_entry_point_kernel_params( iree_hal_xrt_lite_kernel_params_t* out_params); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h index 8b0ed658e..251119fdd 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h @@ -27,7 +27,7 @@ iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( iree_hal_executable_cache_t** out_executable_cache); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp index f373e9b77..0e521e4f2 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp @@ -128,8 +128,9 @@ pdev::~pdev() { } void pdev::ioctl(unsigned long cmd, void *arg) const { - if (::ioctl(m_dev_fd, cmd, arg) == -1) + if (::ioctl(m_dev_fd, cmd, arg) == -1) { shim_err(errno, "%s IOCTL failed", ioctl_cmd2name(cmd).c_str()); + } } void *pdev::mmap(void *addr, size_t len, int prot, int flags, diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp index 4b0cacb95..868b2eb0e 100644 --- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -71,7 +71,9 @@ void hw_q::submit_wait(const std::vector &fences) { void hw_q::submit_signal(const fence_handle *f) { f->submit_signal(m_hwctx); } -hw_q::~hw_q() { shim_debug("Destroying KMQ HW queue"); } +hw_q::~hw_q() { + shim_debug("Destroying KMQ HW queue"); +} void hw_q::issue_command(bo *cmd_bo) { // Assuming 1024 max args per cmd bo