diff --git a/README.md b/README.md index 78ebaf80b..6660f1f1f 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ If the sample runs without generating errors, the installation is complete. #### Known Issues +* The image extension is currently not supported for discrete GPUs. An image extension library is not provided in the binary package. The standard hsa_ext_image.h extension include file is provided for reference. * Each HSA process creates and internal DMA queue, but there is a system-wide limit of four DMA queues. The fifth simultaneous HSA process will fail hsa_init() with HSA_STATUS_ERROR_OUT_OF_RESOURCES. To run an unlimited number of simultaneous HSA processes, set the environment variable HSA_ENABLE_SDMA=0. #### Disclaimer diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index 8df54eebf..d45649e93 100644 --- a/src/core/common/hsa_table_interface.cpp +++ b/src/core/common/hsa_table_interface.cpp @@ -1035,7 +1035,7 @@ hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr) { return amdExtTable->hsa_amd_interop_unmap_buffer_fn(ptr); } -// Use the function pointer from local instance Image Extension +// Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_image_create( hsa_agent_t agent, const hsa_ext_image_descriptor_t *image_descriptor, @@ -1047,3 +1047,31 @@ hsa_status_t HSA_API hsa_amd_image_create( image_layout, image_data, access_permission, image); } +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_pointer_info(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), + uint32_t* num_agents_accessible, hsa_agent_t** accessible) { + return amdExtTable->hsa_amd_pointer_info_fn(ptr, info, alloc, num_agents_accessible, accessible); +} + +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_pointer_info_set_userdata(void* ptr, void* userptr) { + return amdExtTable->hsa_amd_pointer_info_set_userdata_fn(ptr, userptr); +} + +// Mirrors Amd Extension Apis +hsa_status_t 
hsa_amd_ipc_memory_create(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) { + return amdExtTable->hsa_amd_ipc_memory_create_fn(ptr, len, handle); +} + +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_ipc_memory_attach(const hsa_amd_ipc_memory_t* ipc, size_t len, + uint32_t num_agents, const hsa_agent_t* mapping_agents, + void** mapped_ptr) { + return amdExtTable->hsa_amd_ipc_memory_attach_fn(ipc, len, num_agents, mapping_agents, + mapped_ptr); +} + +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_ipc_memory_detach(void* mapped_ptr) { + return amdExtTable->hsa_amd_ipc_memory_detach_fn(mapped_ptr); +} diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h index 7f8410af5..1932eaa90 100644 --- a/src/core/inc/amd_aql_queue.h +++ b/src/core/inc/amd_aql_queue.h @@ -70,43 +70,43 @@ class AqlQueue : public core::Queue, public core::Signal { bool IsValid() const { return valid_; } /// @brief Queue interfaces - hsa_status_t Inactivate(); + hsa_status_t Inactivate() override; /// @brief Atomically reads the Read index of with Acquire semantics /// /// @return uint64_t Value of read index - uint64_t LoadReadIndexAcquire(); + uint64_t LoadReadIndexAcquire() override; /// @brief Atomically reads the Read index of with Relaxed semantics /// /// @return uint64_t Value of read index - uint64_t LoadReadIndexRelaxed(); + uint64_t LoadReadIndexRelaxed() override; /// @brief Atomically reads the Write index of with Acquire semantics /// /// @return uint64_t Value of write index - uint64_t LoadWriteIndexAcquire(); + uint64_t LoadWriteIndexAcquire() override; /// @brief Atomically reads the Write index of with Relaxed semantics /// /// @return uint64_t Value of write index - uint64_t LoadWriteIndexRelaxed(); + uint64_t LoadWriteIndexRelaxed() override; /// @brief This operation is illegal - void StoreReadIndexRelaxed(uint64_t value) { assert(false); } + void StoreReadIndexRelaxed(uint64_t value) override { assert(false); } /// @brief This operation is 
illegal - void StoreReadIndexRelease(uint64_t value) { assert(false); } + void StoreReadIndexRelease(uint64_t value) override { assert(false); } /// @brief Atomically writes the Write index of with Relaxed semantics /// /// @param value New value of write index to update with - void StoreWriteIndexRelaxed(uint64_t value); + void StoreWriteIndexRelaxed(uint64_t value) override; /// @brief Atomically writes the Write index of with Release semantics /// /// @param value New value of write index to update with - void StoreWriteIndexRelease(uint64_t value); + void StoreWriteIndexRelease(uint64_t value) override; /// @brief Compares and swaps Write index using Acquire and Release semantics /// @@ -115,7 +115,7 @@ class AqlQueue : public core::Queue, public core::Signal { /// @param value Value of new write index /// /// @return uint64_t Value of write index before the update - uint64_t CasWriteIndexAcqRel(uint64_t expected, uint64_t value); + uint64_t CasWriteIndexAcqRel(uint64_t expected, uint64_t value) override; /// @brief Compares and swaps Write index using Acquire semantics /// @@ -124,7 +124,7 @@ class AqlQueue : public core::Queue, public core::Signal { /// @param value Value of new write index /// /// @return uint64_t Value of write index before the update - uint64_t CasWriteIndexAcquire(uint64_t expected, uint64_t value); + uint64_t CasWriteIndexAcquire(uint64_t expected, uint64_t value) override; /// @brief Compares and swaps Write index using Relaxed semantics /// @@ -133,7 +133,7 @@ class AqlQueue : public core::Queue, public core::Signal { /// @param value Value of new write index /// /// @return uint64_t Value of write index before the update - uint64_t CasWriteIndexRelaxed(uint64_t expected, uint64_t value); + uint64_t CasWriteIndexRelaxed(uint64_t expected, uint64_t value) override; /// @brief Compares and swaps Write index using Release semantics /// @@ -142,35 +142,35 @@ class AqlQueue : public core::Queue, public core::Signal { /// @param value Value 
of new write index /// /// @return uint64_t Value of write index before the update - uint64_t CasWriteIndexRelease(uint64_t expected, uint64_t value); + uint64_t CasWriteIndexRelease(uint64_t expected, uint64_t value) override; /// @brief Updates the Write index using Acquire and Release semantics /// /// @param value Value of new write index /// /// @return uint64_t Value of write index before the update - uint64_t AddWriteIndexAcqRel(uint64_t value); + uint64_t AddWriteIndexAcqRel(uint64_t value) override; /// @brief Updates the Write index using Acquire semantics /// /// @param value Value of new write index /// /// @return uint64_t Value of write index before the update - uint64_t AddWriteIndexAcquire(uint64_t value); + uint64_t AddWriteIndexAcquire(uint64_t value) override; /// @brief Updates the Write index using Relaxed semantics /// /// @param value Value of new write index /// /// @return uint64_t Value of write index before the update - uint64_t AddWriteIndexRelaxed(uint64_t value); + uint64_t AddWriteIndexRelaxed(uint64_t value) override; /// @brief Updates the Write index using Release semantics /// /// @param value Value of new write index /// /// @return uint64_t Value of write index before the update - uint64_t AddWriteIndexRelease(uint64_t value); + uint64_t AddWriteIndexRelease(uint64_t value) override; /// @brief Set CU Masking /// @@ -179,166 +179,159 @@ class AqlQueue : public core::Queue, public core::Signal { /// @param cu_mask pointer to cu mask /// /// @return hsa_status_t - hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, - const uint32_t* cu_mask); + hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override; // @brief Submits a block of PM4 and waits until it has been executed. 
void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override; /// @brief This operation is illegal - hsa_signal_value_t LoadRelaxed() { + hsa_signal_value_t LoadRelaxed() override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t LoadAcquire() { + hsa_signal_value_t LoadAcquire() override { assert(false); return 0; } /// @brief Update signal value using Relaxed semantics - void StoreRelaxed(hsa_signal_value_t value); + void StoreRelaxed(hsa_signal_value_t value) override; /// @brief Update signal value using Release semantics - void StoreRelease(hsa_signal_value_t value); + void StoreRelease(hsa_signal_value_t value) override; /// @brief This operation is illegal - hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, - hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) { + hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, - hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) { + hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) override { assert(false); return 0; } /// @brief This operation is illegal - void AndRelaxed(hsa_signal_value_t value) { assert(false); } + void AndRelaxed(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void AndAcquire(hsa_signal_value_t value) { assert(false); } + void AndAcquire(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void AndRelease(hsa_signal_value_t value) { assert(false); } + void AndRelease(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void 
AndAcqRel(hsa_signal_value_t value) { assert(false); } + void AndAcqRel(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void OrRelaxed(hsa_signal_value_t value) { assert(false); } + void OrRelaxed(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void OrAcquire(hsa_signal_value_t value) { assert(false); } + void OrAcquire(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void OrRelease(hsa_signal_value_t value) { assert(false); } + void OrRelease(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void OrAcqRel(hsa_signal_value_t value) { assert(false); } + void OrAcqRel(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void XorRelaxed(hsa_signal_value_t value) { assert(false); } + void XorRelaxed(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void XorAcquire(hsa_signal_value_t value) { assert(false); } + void XorAcquire(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void XorRelease(hsa_signal_value_t value) { assert(false); } + void XorRelease(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void XorAcqRel(hsa_signal_value_t value) { assert(false); } + void XorAcqRel(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void AddRelaxed(hsa_signal_value_t value) { assert(false); } + void AddRelaxed(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void AddAcquire(hsa_signal_value_t value) { assert(false); } + void AddAcquire(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void AddRelease(hsa_signal_value_t value) { assert(false); } + void AddRelease(hsa_signal_value_t value) override { 
assert(false); } /// @brief This operation is illegal - void AddAcqRel(hsa_signal_value_t value) { assert(false); } + void AddAcqRel(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void SubRelaxed(hsa_signal_value_t value) { assert(false); } + void SubRelaxed(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void SubAcquire(hsa_signal_value_t value) { assert(false); } + void SubAcquire(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void SubRelease(hsa_signal_value_t value) { assert(false); } + void SubRelease(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - void SubAcqRel(hsa_signal_value_t value) { assert(false); } + void SubAcqRel(hsa_signal_value_t value) override { assert(false); } /// @brief This operation is illegal - hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) { + hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) { + hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t ExchRelease(hsa_signal_value_t value) { + hsa_signal_value_t ExchRelease(hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) { + hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, - hsa_signal_value_t value) { + hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t CasAcquire(hsa_signal_value_t 
expected, - hsa_signal_value_t value) { + hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t CasRelease(hsa_signal_value_t expected, - hsa_signal_value_t value) { + hsa_signal_value_t CasRelease(hsa_signal_value_t expected, hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, - hsa_signal_value_t value) { + hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, hsa_signal_value_t value) override { assert(false); return 0; } /// @brief This operation is illegal - hsa_signal_value_t* ValueLocation() const { + hsa_signal_value_t* ValueLocation() const override { assert(false); return NULL; } /// @brief This operation is illegal - HsaEvent* EopEvent() { + HsaEvent* EopEvent() override { assert(false); return NULL; } @@ -350,7 +343,7 @@ class AqlQueue : public core::Queue, public core::Signal { void operator delete(void*, void*) {} protected: - bool _IsA(rtti_t id) const { return id == &rtti_id_; } + bool _IsA(rtti_t id) const override { return id == &rtti_id_; } private: uint32_t ComputeRingBufferMinPkts(); diff --git a/src/core/inc/amd_hsa_code.hpp b/src/core/inc/amd_hsa_code.hpp index 644bd7621..030b53a9b 100644 --- a/src/core/inc/amd_hsa_code.hpp +++ b/src/core/inc/amd_hsa_code.hpp @@ -263,7 +263,7 @@ namespace code { amd::elf::Section* HsaText() { assert(hsatext); return hsatext; } const amd::elf::Section* HsaText() const { assert(hsatext); return hsatext; } amd::elf::SymbolTable* Symtab() { assert(img); return img->symtab(); } - uint16_t Machine() { return img->Machine(); } + uint16_t Machine() const { return img->Machine(); } AmdHsaCode(bool combineDataSegments = true); virtual ~AmdHsaCode(); @@ -319,8 +319,8 @@ namespace code { Symbol* AddKernelDefinition(const std::string& name, const void* isa, size_t isa_size); - size_t 
DataSegmentCount() { return dataSegments.size(); } - Segment* DataSegment(size_t i) { return dataSegments[i]; } + size_t DataSegmentCount() const { return dataSegments.size(); } + Segment* DataSegment(size_t i) const { return dataSegments[i]; } size_t DataSectionCount() { return dataSections.size(); } Section* DataSection(size_t i) { return dataSections[i]; } diff --git a/src/core/inc/amd_hsa_loader.hpp b/src/core/inc/amd_hsa_loader.hpp index 562456a63..8cd2acc1d 100644 --- a/src/core/inc/amd_hsa_loader.hpp +++ b/src/core/inc/amd_hsa_loader.hpp @@ -74,9 +74,9 @@ typedef hsa_executable_symbol_t hsa_symbol_t; typedef hsa_executable_symbol_info_t hsa_symbol_info_t; /// @brief Loaded code object attributes. -enum hsa_loaded_code_object_info_t { - HSA_LOADED_CODE_OBJECT_INFO_ELF_IMAGE = 0, - HSA_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE = 1 +enum amd_loaded_code_object_info_t { + AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE = 0, + AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE = 1 }; /// @brief Loaded segment handle. 
@@ -200,7 +200,7 @@ class LoadedCodeObject { virtual ~LoadedCodeObject() {} - virtual bool GetInfo(hsa_loaded_code_object_info_t attribute, void *value) = 0; + virtual bool GetInfo(amd_loaded_code_object_info_t attribute, void *value) = 0; virtual hsa_status_t IterateLoadedSegments( hsa_status_t (*callback)( @@ -282,16 +282,14 @@ class Executable { hsa_agent_t agent, hsa_code_object_t code_object, const char *options, - hsa_loaded_code_object_t *loaded_code_object = nullptr, - bool load_legacy = true) = 0; + hsa_loaded_code_object_t *loaded_code_object = nullptr) = 0; virtual hsa_status_t LoadCodeObject( hsa_agent_t agent, hsa_code_object_t code_object, size_t code_object_size, const char *options, - hsa_loaded_code_object_t *loaded_code_object = nullptr, - bool load_legacy = true) = 0; + hsa_loaded_code_object_t *loaded_code_object = nullptr) = 0; virtual hsa_status_t Freeze(const char *options) = 0; @@ -401,6 +399,10 @@ class Loader { hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, size_t *num_segment_descriptors) = 0; + /// @brief Finds the handle of executable to which @p device_address + /// belongs. Return NULL handle if device address is invalid. + virtual hsa_executable_t FindExecutable(uint64_t device_address) = 0; + /// @brief Returns host address given @p device_address. If @p device_address /// is already host address, returns null pointer. If @p device_address is /// invalid address, returns null pointer. 
diff --git a/src/core/inc/amd_loader_context.hpp b/src/core/inc/amd_loader_context.hpp index 02d73ec35..27830ff36 100644 --- a/src/core/inc/amd_loader_context.hpp +++ b/src/core/inc/amd_loader_context.hpp @@ -69,23 +69,19 @@ class LoaderContext final: public hsa::loader::Context { bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override; - bool ImageExtensionSupported(); + bool ImageExtensionSupported() override; - hsa_status_t ImageCreate( - hsa_agent_t agent, - hsa_access_permission_t image_permission, - const hsa_ext_image_descriptor_t *image_descriptor, - const void *image_data, - hsa_ext_image_t *image_handle); + hsa_status_t ImageCreate(hsa_agent_t agent, hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t* image_descriptor, + const void* image_data, hsa_ext_image_t* image_handle) override; - hsa_status_t ImageDestroy(hsa_agent_t agent, hsa_ext_image_t image_handle); + hsa_status_t ImageDestroy(hsa_agent_t agent, hsa_ext_image_t image_handle) override; - hsa_status_t SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle); + hsa_status_t SamplerCreate(hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t* sampler_descriptor, + hsa_ext_sampler_t* sampler_handle) override; - hsa_status_t SamplerDestroy(hsa_agent_t agent, hsa_ext_sampler_t sampler_handle); + hsa_status_t SamplerDestroy(hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; private: LoaderContext(const LoaderContext&); diff --git a/src/core/inc/host_queue.h b/src/core/inc/host_queue.h index 091f40c93..d405799d9 100644 --- a/src/core/inc/host_queue.h +++ b/src/core/inc/host_queue.h @@ -56,90 +56,89 @@ class HostQueue : public Queue { ~HostQueue(); - hsa_status_t Inactivate() { return HSA_STATUS_SUCCESS; } + hsa_status_t Inactivate() override { return HSA_STATUS_SUCCESS; } - uint64_t LoadReadIndexAcquire() { + uint64_t LoadReadIndexAcquire() 
override { return atomic::Load(&amd_queue_.read_dispatch_id, std::memory_order_acquire); } - uint64_t LoadReadIndexRelaxed() { + uint64_t LoadReadIndexRelaxed() override { return atomic::Load(&amd_queue_.read_dispatch_id, std::memory_order_relaxed); } - uint64_t LoadWriteIndexAcquire() { + uint64_t LoadWriteIndexAcquire() override { return atomic::Load(&amd_queue_.write_dispatch_id, std::memory_order_acquire); } - uint64_t LoadWriteIndexRelaxed() { + uint64_t LoadWriteIndexRelaxed() override { return atomic::Load(&amd_queue_.write_dispatch_id, std::memory_order_relaxed); } - void StoreReadIndexRelaxed(uint64_t value) { + void StoreReadIndexRelaxed(uint64_t value) override { atomic::Store(&amd_queue_.read_dispatch_id, value, std::memory_order_relaxed); } - void StoreReadIndexRelease(uint64_t value) { + void StoreReadIndexRelease(uint64_t value) override { atomic::Store(&amd_queue_.read_dispatch_id, value, std::memory_order_release); } - void StoreWriteIndexRelaxed(uint64_t value) { + void StoreWriteIndexRelaxed(uint64_t value) override { atomic::Store(&amd_queue_.write_dispatch_id, value, std::memory_order_relaxed); } - void StoreWriteIndexRelease(uint64_t value) { + void StoreWriteIndexRelease(uint64_t value) override { atomic::Store(&amd_queue_.write_dispatch_id, value, std::memory_order_release); } - uint64_t CasWriteIndexAcqRel(uint64_t expected, uint64_t value) { + uint64_t CasWriteIndexAcqRel(uint64_t expected, uint64_t value) override { return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, std::memory_order_acq_rel); } - uint64_t CasWriteIndexAcquire(uint64_t expected, uint64_t value) { + uint64_t CasWriteIndexAcquire(uint64_t expected, uint64_t value) override { return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, std::memory_order_acquire); } - uint64_t CasWriteIndexRelaxed(uint64_t expected, uint64_t value) { + uint64_t CasWriteIndexRelaxed(uint64_t expected, uint64_t value) override { return 
atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, std::memory_order_relaxed); } - uint64_t CasWriteIndexRelease(uint64_t expected, uint64_t value) { + uint64_t CasWriteIndexRelease(uint64_t expected, uint64_t value) override { return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, std::memory_order_release); } - uint64_t AddWriteIndexAcqRel(uint64_t value) { + uint64_t AddWriteIndexAcqRel(uint64_t value) override { return atomic::Add(&amd_queue_.write_dispatch_id, value, std::memory_order_acq_rel); } - uint64_t AddWriteIndexAcquire(uint64_t value) { + uint64_t AddWriteIndexAcquire(uint64_t value) override { return atomic::Add(&amd_queue_.write_dispatch_id, value, std::memory_order_acquire); } - uint64_t AddWriteIndexRelaxed(uint64_t value) { + uint64_t AddWriteIndexRelaxed(uint64_t value) override { return atomic::Add(&amd_queue_.write_dispatch_id, value, std::memory_order_relaxed); } - uint64_t AddWriteIndexRelease(uint64_t value) { + uint64_t AddWriteIndexRelease(uint64_t value) override { return atomic::Add(&amd_queue_.write_dispatch_id, value, std::memory_order_release); } - hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, - const uint32_t* cu_mask) { + hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, const uint32_t* cu_mask) override { return HSA_STATUS_ERROR; } diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h index 54f8e3458..763e8553b 100644 --- a/src/core/inc/hsa_ext_amd_impl.h +++ b/src/core/inc/hsa_ext_amd_impl.h @@ -181,6 +181,24 @@ hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents, // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr); +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_pointer_info(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), + uint32_t* num_agents_accessible, hsa_agent_t** accessible); + +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_pointer_info_set_userdata(void* ptr, void* 
userptr); + +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_ipc_memory_create(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle); + +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_ipc_memory_attach(const hsa_amd_ipc_memory_t* handle, size_t len, + uint32_t num_agents, const hsa_agent_t* mapping_agents, + void** mapped_ptr); + +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_ipc_memory_detach(void* mapped_ptr); + } // end of AMD namespace #endif // header guard diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h index 8ad74bd1b..c8d00bcb6 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -70,6 +70,11 @@ #define HSA_QUEUE_ALIGN_BYTES 64 #define HSA_PACKET_ALIGN_BYTES 64 +//Avoids include +namespace amd { + class MemoryRegion; +} + namespace core { extern bool g_use_interrupt_wait; @@ -84,6 +89,7 @@ extern bool g_use_interrupt_wait; /// - maintain loader state. /// - monitor asynchronous event from agent. class Runtime { + friend class amd::MemoryRegion; public: /// @brief Structure to describe connectivity between agents. struct LinkInfo { @@ -258,11 +264,22 @@ class Runtime { hsa_status_t InteropUnmap(void* ptr); + hsa_status_t PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), + uint32_t* num_agents_accessible, hsa_agent_t** accessible); + + hsa_status_t SetPtrInfoData(void* ptr, void* userptr); + + hsa_status_t IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle); + + hsa_status_t IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents, + Agent** mapping_agents, void** mapped_ptr); + + hsa_status_t IPCDetach(void* ptr); + const std::vector& cpu_agents() { return cpu_agents_; } const std::vector& gpu_agents() { return gpu_agents_; } - const std::vector& gpu_ids() { return gpu_ids_; } Agent* blit_agent() { return blit_agent_; } @@ -399,7 +416,9 @@ class Runtime { // Mutex object to protect multithreaded access to ::Acquire and ::Release.
KernelMutex kernel_lock_; - // Mutex object to protect multithreaded access to ::allocation_map_. + // Mutex object to protect multithreaded access to ::allocation_map_, + // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo + // registered & mapped arrays. KernelMutex memory_lock_; // Array containing tools library handles. @@ -411,6 +430,9 @@ class Runtime { // Agent list containing all compatible GPU agents in the platform. std::vector gpu_agents_; + // Agent map containing all agents indexed by their KFD node IDs. + std::map > agents_by_node_; + // Agent list containing all compatible gpu agent ids in the platform. std::vector gpu_ids_; diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index a1e57a527..3b01d2bc9 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -217,12 +217,10 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, } } - assert(amd_queue_.group_segment_aperture_base_hi != NULL && - "No group region found."); + assert(amd_queue_.group_segment_aperture_base_hi != 0 && "No group region found."); if (core::Runtime::runtime_singleton_->flag().check_flat_scratch()) { - assert(amd_queue_.private_segment_aperture_base_hi != NULL && - "No private region found."); + assert(amd_queue_.private_segment_aperture_base_hi != 0 && "No private region found."); } MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() { @@ -405,11 +403,12 @@ void AqlQueue::StoreRelaxed(hsa_signal_value_t value) { if (legacy_dispatch_id > amd_queue_.max_legacy_doorbell_dispatch_id_plus_1) { // Record the most recent packet index used in a doorbell submission. // This field will be interpreted as a write index upon HW queue connect. - // Must be visible to the HW before sending the doorbell to avoid a race. + // Make ring buffer visible to HW before updating write index. 
atomic::Store(&amd_queue_.max_legacy_doorbell_dispatch_id_plus_1, - legacy_dispatch_id, std::memory_order_relaxed); + legacy_dispatch_id, std::memory_order_release); // Write the dispatch id to the hardware MMIO doorbell. + // Make write index visible to HW before sending doorbell. if (doorbell_type_ == 0) { // The legacy GFXIP 7 hardware doorbell expects: // 1. Packet index wrapped to a point within the ring buffer @@ -417,18 +416,20 @@ void AqlQueue::StoreRelaxed(hsa_signal_value_t value) { uint64_t queue_size_mask = ((1 + queue_full_workaround_) * amd_queue_.hsa_queue.size) - 1; - *(volatile uint32_t*)signal_.legacy_hardware_doorbell_ptr = - uint32_t((legacy_dispatch_id & queue_size_mask) * - (sizeof(core::AqlPacket) / sizeof(uint32_t))); + atomic::Store(signal_.legacy_hardware_doorbell_ptr, + uint32_t((legacy_dispatch_id & queue_size_mask) * + (sizeof(core::AqlPacket) / sizeof(uint32_t))), + std::memory_order_release); } else if (doorbell_type_ == 1) { - *(volatile uint32_t*)signal_.legacy_hardware_doorbell_ptr = - uint32_t(legacy_dispatch_id); + atomic::Store(signal_.legacy_hardware_doorbell_ptr, + uint32_t(legacy_dispatch_id), std::memory_order_release); } else { assert(false && "Agent has unsupported doorbell semantics"); } } // Release spinlock protecting the legacy doorbell. + // Also ensures timely delivery of (write-combined) doorbell to HW. 
atomic::Store(&amd_queue_.legacy_doorbell_lock, 0U, std::memory_order_release); } @@ -670,9 +671,10 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { queue->agent_->ReleaseQueueScratch(scratch.queue_base); + uint64_t pkt_slot_idx = queue->amd_queue_.read_dispatch_id % queue->amd_queue_.hsa_queue.size; + const core::AqlPacket& pkt = - ((core::AqlPacket*)queue->amd_queue_.hsa_queue - .base_address)[queue->amd_queue_.read_dispatch_id]; + ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; uint32_t scratch_request = pkt.dispatch.private_segment_size; @@ -779,8 +781,8 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { uint32_t slot_idx = uint32_t(write_idx % public_handle()->size); constexpr uint32_t slot_size_b = 0x40; - uint32_t* queue_slot = (uint32_t*)uintptr_t(public_handle()->base_address + - (slot_idx * slot_size_b)); + uint32_t* queue_slot = + (uint32_t*)(uintptr_t(public_handle()->base_address) + (slot_idx * slot_size_b)); // Copy client PM4 command into IB. assert(cmd_size_b < pm4_ib_size_b_ && "PM4 exceeds IB size"); @@ -909,7 +911,7 @@ void AqlQueue::InitScratchSRD() { uint32_t(queue_scratch_.size_per_thread); // Set concurrent wavefront limits only when scratch is being used. 
- COMPUTE_TMPRING_SIZE tmpring_size = {0}; + COMPUTE_TMPRING_SIZE tmpring_size = {}; if (queue_scratch_.size == 0) { amd_queue_.compute_tmpring_size = tmpring_size.u32All; return; diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp old mode 100644 new mode 100755 diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp old mode 100644 new mode 100755 diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index 77338cf81..95b4f259a 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -469,11 +469,15 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, HsaMemMapFlags map_flag = map_flag_; map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0; - uint64_t alternate_va = 0; - if (!amd::MemoryRegion::MakeKfdMemoryResident( - whitelist_nodes.size(), &whitelist_nodes[0], const_cast(ptr), - size, &alternate_va, map_flag)) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + { + ScopedAcquire lock(&core::Runtime::runtime_singleton_->memory_lock_); + uint64_t alternate_va = 0; + if (!amd::MemoryRegion::MakeKfdMemoryResident( + whitelist_nodes.size(), &whitelist_nodes[0], const_cast(ptr), + size, &alternate_va, map_flag)) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } } for (GpuAgentInt* gpu : whitelist_gpus) { diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index b468e1b6e..66927b5fe 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -299,9 +299,10 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin std::string name; size_t size; }; - static sizes_t sizes[] = {"hsa_ext_images_1_00_pfn_t", sizeof(hsa_ext_images_1_00_pfn_t), - "hsa_ext_finalizer_1_00_pfn_t", sizeof(hsa_ext_finalizer_1_00_pfn_t), - "hsa_ven_amd_loader_1_00_pfn_t", sizeof(hsa_ven_amd_loader_1_00_pfn_t)}; + static sizes_t sizes[] = { + {"hsa_ext_images_1_00_pfn_t", 
sizeof(hsa_ext_images_1_00_pfn_t)}, + {"hsa_ext_finalizer_1_00_pfn_t", sizeof(hsa_ext_finalizer_1_00_pfn_t)}, + {"hsa_ven_amd_loader_1_00_pfn_t", sizeof(hsa_ven_amd_loader_1_00_pfn_t)}}; static const size_t num_tables = sizeof(sizes) / sizeof(sizes_t); if (minor > 99) return 0; @@ -395,6 +396,7 @@ hsa_status_t hsa_system_get_major_extension_table(uint16_t extension, uint16_t v ext_table.hsa_ven_amd_loader_query_host_address = hsa_ven_amd_loader_query_host_address; ext_table.hsa_ven_amd_loader_query_segment_descriptors = hsa_ven_amd_loader_query_segment_descriptors; + ext_table.hsa_ven_amd_loader_query_executable = hsa_ven_amd_loader_query_executable; memcpy(table, &ext_table, Min(sizeof(ext_table), table_length)); @@ -1925,7 +1927,7 @@ hsa_status_t hsa_executable_load_program_code_object( hsa_code_object_t code_object = {reinterpret_cast(wrapper->code_object_memory)}; return exec->LoadCodeObject( - {0}, code_object, options, loaded_code_object, false); + {0}, code_object, options, loaded_code_object); } hsa_status_t hsa_executable_load_agent_code_object( @@ -1950,7 +1952,7 @@ hsa_status_t hsa_executable_load_agent_code_object( hsa_code_object_t code_object = {reinterpret_cast(wrapper->code_object_memory)}; return exec->LoadCodeObject( - agent, code_object, options, loaded_code_object, false); + agent, code_object, options, loaded_code_object); } hsa_status_t hsa_executable_freeze( diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp index a2d56d032..ee0afc0e4 100644 --- a/src/core/runtime/hsa_api_trace.cpp +++ b/src/core/runtime/hsa_api_trace.cpp @@ -358,6 +358,11 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_memory_fill_fn = AMD::hsa_amd_memory_fill; amd_ext_api.hsa_amd_interop_map_buffer_fn = AMD::hsa_amd_interop_map_buffer; amd_ext_api.hsa_amd_interop_unmap_buffer_fn = AMD::hsa_amd_interop_unmap_buffer; + amd_ext_api.hsa_amd_pointer_info_fn = AMD::hsa_amd_pointer_info; + 
amd_ext_api.hsa_amd_pointer_info_set_userdata_fn = AMD::hsa_amd_pointer_info_set_userdata; + amd_ext_api.hsa_amd_ipc_memory_create_fn = AMD::hsa_amd_ipc_memory_create; + amd_ext_api.hsa_amd_ipc_memory_attach_fn = AMD::hsa_amd_ipc_memory_attach; + amd_ext_api.hsa_amd_ipc_memory_detach_fn = AMD::hsa_amd_ipc_memory_detach; } class Init { diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index a35d7368c..70daffd6a 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -562,6 +562,7 @@ hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents, uint32_t flags, size_t* size, void** ptr, size_t* metadata_size, const void** metadata) { + static const int tinyArraySize=8; IS_OPEN(); IS_BAD_PTR(agents); IS_BAD_PTR(size); @@ -569,9 +570,9 @@ hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents, if (flags != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; if (num_agents == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; - core::Agent* short_agents[64]; + core::Agent* short_agents[tinyArraySize]; core::Agent** core_agents = short_agents; - if (num_agents > 64) { + if (num_agents > tinyArraySize) { core_agents = new core::Agent* [num_agents]; if (core_agents == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } @@ -586,7 +587,7 @@ hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents, num_agents, core_agents, interop_handle, flags, size, ptr, metadata_size, metadata); - if (num_agents > 64) delete[] core_agents; + if (num_agents > tinyArraySize) delete[] core_agents; return ret; } @@ -596,4 +597,59 @@ hsa_status_t hsa_amd_interop_unmap_buffer(void* ptr) { return HSA_STATUS_SUCCESS; } +hsa_status_t hsa_amd_pointer_info(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), + uint32_t* num_accessible, hsa_agent_t** accessible) { + IS_OPEN(); + IS_BAD_PTR(ptr); + IS_BAD_PTR(info); + return core::Runtime::runtime_singleton_->PtrInfo(ptr, info, alloc, num_accessible, accessible); +} + +hsa_status_t 
hsa_amd_pointer_info_set_userdata(void* ptr, void* userdata) { + IS_OPEN(); + IS_BAD_PTR(ptr); + return core::Runtime::runtime_singleton_->SetPtrInfoData(ptr, userdata); +} + +hsa_status_t hsa_amd_ipc_memory_create(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) { + IS_OPEN(); + IS_BAD_PTR(ptr); + IS_BAD_PTR(handle); + return core::Runtime::runtime_singleton_->IPCCreate(ptr, len, handle); +} + +hsa_status_t hsa_amd_ipc_memory_attach(const hsa_amd_ipc_memory_t* ipc, size_t len, + uint32_t num_agents, const hsa_agent_t* mapping_agents, + void** mapped_ptr) { + static const int tinyArraySize = 8; + IS_OPEN(); + IS_BAD_PTR(mapped_ptr); + if (num_agents != 0) IS_BAD_PTR(mapping_agents); + + core::Agent** core_agents = nullptr; + if (num_agents > tinyArraySize) + core_agents = new core::Agent*[num_agents]; + else + core_agents = (core::Agent**)alloca(sizeof(core::Agent*) * num_agents); + if (core_agents == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + MAKE_SCOPE_GUARD([&]() { + if (num_agents > tinyArraySize) delete[] core_agents; + }); + + for (int i = 0; i < num_agents; i++) { + core::Agent* device = core::Agent::Convert(mapping_agents[i]); + IS_VALID(device); + core_agents[i] = device; + } + + return core::Runtime::runtime_singleton_->IPCAttach(ipc, len, num_agents, core_agents, + mapped_ptr); +} + +hsa_status_t hsa_amd_ipc_memory_detach(void* mapped_ptr) { + IS_OPEN(); + IS_BAD_PTR(mapped_ptr); + return core::Runtime::runtime_singleton_->IPCDetach(mapped_ptr); +} + } // end of AMD namespace diff --git a/src/core/runtime/hsa_ven_amd_loader.cpp b/src/core/runtime/hsa_ven_amd_loader.cpp index ba951053e..c95b62d14 100644 --- a/src/core/runtime/hsa_ven_amd_loader.cpp +++ b/src/core/runtime/hsa_ven_amd_loader.cpp @@ -80,3 +80,24 @@ hsa_status_t HSA_API hsa_ven_amd_loader_query_segment_descriptors( // Arguments are checked by the loader. 
return Runtime::runtime_singleton_->loader()->QuerySegmentDescriptors(segment_descriptors, num_segment_descriptors); } + +hsa_status_t HSA_API hsa_ven_amd_loader_query_executable( + const void *device_address, + hsa_executable_t *executable) { + + if (false == core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if ((nullptr == device_address) || (nullptr == executable)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + uint64_t udaddr = reinterpret_cast(device_address); + hsa_executable_t exec = Runtime::runtime_singleton_->loader()->FindExecutable(udaddr); + if (0 == exec.handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + *executable = exec; + return HSA_STATUS_SUCCESS; +} diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp old mode 100644 new mode 100755 index d9fabab78..06462f2c7 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -65,7 +65,7 @@ #define HSA_VERSION_MAJOR 1 #define HSA_VERSION_MINOR 1 -const char rocrbuildid[] = "ROCR BUILD ID: " STRING(ROCR_BUILD_ID); +const char rocrbuildid[] __attribute__((unused)) = "ROCR BUILD ID: " STRING(ROCR_BUILD_ID); namespace core { bool g_use_interrupt_wait = true; @@ -142,6 +142,10 @@ bool Runtime::IsOpen() { } void Runtime::RegisterAgent(Agent* agent) { + // Record the agent in the node-to-agent reverse lookup table. + agents_by_node_[agent->node_id()].push_back(agent); + + // Process agent as a cpu or gpu device. 
if (agent->device_type() == Agent::DeviceType::kAmdCpuDevice) { cpu_agents_.push_back(agent); @@ -230,6 +234,8 @@ void Runtime::RegisterAgent(Agent* agent) { } void Runtime::DestroyAgents() { + agents_by_node_.clear(); + std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject()); gpu_agents_.clear(); @@ -302,11 +308,11 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent, hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, MemoryRegion::AllocateFlags alloc_flags, void** address) { + ScopedAcquire lock(&memory_lock_); hsa_status_t status = region->Allocate(size, alloc_flags, address); // Track the allocation result so that it could be freed properly. if (status == HSA_STATUS_SUCCESS) { - ScopedAcquire lock(&memory_lock_); allocation_map_[*address] = AllocationRegion(region, size); } @@ -320,22 +326,18 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { const MemoryRegion* region = NULL; size_t size = 0; - { - ScopedAcquire lock(&memory_lock_); + ScopedAcquire lock(&memory_lock_); - std::map::const_iterator it = - allocation_map_.find(ptr); - - if (it == allocation_map_.end()) { - assert(false && "Can't find address in allocation map"); - return HSA_STATUS_ERROR; - } + std::map::const_iterator it = allocation_map_.find(ptr); - region = it->second.region; - size = it->second.size; - - allocation_map_.erase(it); + if (it == allocation_map_.end()) { + assert(false && "Can't find address in allocation map"); + return HSA_STATUS_ERROR; } + region = it->second.region; + size = it->second.size; + + allocation_map_.erase(it); return region->Free(ptr, size); } @@ -463,8 +465,7 @@ hsa_status_t Runtime::AllowAccess(uint32_t num_agents, { ScopedAcquire lock(&memory_lock_); - std::map::const_iterator it = - allocation_map_.find(ptr); + std::map::const_iterator it = allocation_map_.find(ptr); if (it == allocation_map_.end()) { return HSA_STATUS_ERROR; @@ -590,16 +591,17 @@ hsa_status_t Runtime::InteropMap(uint32_t 
num_agents, Agent** agents, int interop_handle, uint32_t flags, size_t* size, void** ptr, size_t* metadata_size, const void** metadata) { + static const int tinyArraySize=8; HsaGraphicsResourceInfo info; - HSAuint32 short_nodes[64]; + HSAuint32 short_nodes[tinyArraySize]; HSAuint32* nodes = short_nodes; - if (num_agents > 64) { + if (num_agents > tinyArraySize) { nodes = new HSAuint32[num_agents]; if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } MAKE_SCOPE_GUARD([&]() { - if (num_agents > 64) delete[] nodes; + if (num_agents > tinyArraySize) delete[] nodes; }); for (int i = 0; i < num_agents; i++) @@ -618,10 +620,11 @@ hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents, &altAddress, map_flags, num_agents, nodes) != HSAKMT_STATUS_SUCCESS) { map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB; - if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes, - &altAddress, map_flags, num_agents, - nodes) != HSAKMT_STATUS_SUCCESS) + if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes, &altAddress, map_flags, + num_agents, nodes) != HSAKMT_STATUS_SUCCESS) { + hsaKmtDeregisterMemory(info.MemoryAddress); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } } if (metadata_size != NULL) *metadata_size = info.MetadataSizeInBytes; @@ -633,8 +636,7 @@ hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents, return HSA_STATUS_SUCCESS; } -hsa_status_t Runtime::InteropUnmap(void* ptr) -{ +hsa_status_t Runtime::InteropUnmap(void* ptr) { if(hsaKmtUnmapMemoryToGPU(ptr)!=HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT; if(hsaKmtDeregisterMemory(ptr)!=HSAKMT_STATUS_SUCCESS) @@ -642,6 +644,149 @@ hsa_status_t Runtime::InteropUnmap(void* ptr) return HSA_STATUS_SUCCESS; } +hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), + uint32_t* num_agents_accessible, hsa_agent_t** accessible) { + HsaPointerInfo thunkInfo; + uint32_t* mappedNodes; + + // check output struct is at least as 
large as the first info revision. + if (info->size < sizeof(struct hsa_amd_pointer_info_v1_s)) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + bool returnListData = + ((alloc != nullptr) && (num_agents_accessible != nullptr) && (accessible != nullptr)); + if (returnListData) { + size_t max_agents = cpu_agents_.size() + gpu_agents_.size(); + mappedNodes = (uint32_t*)alloca(max_agents * sizeof(uint32_t)); + // memory_lock protects access to the NMappedNodes array since this changes with calls to memory + // APIs. + ScopedAcquire lock(&memory_lock_); + hsaKmtQueryPointerInfo(ptr, &thunkInfo); + assert(thunkInfo.NMappedNodes <= max_agents && + "PointerInfo: Thunk returned more than all agents in NMappedNodes."); + memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t)); + } else { + hsaKmtQueryPointerInfo(ptr, &thunkInfo); + } + + static_assert((int)HSA_POINTER_UNKNOWN == (int)HSA_EXT_POINTER_TYPE_UNKNOWN, + "Thunk pointer info mismatch"); + static_assert((int)HSA_POINTER_ALLOCATED == (int)HSA_EXT_POINTER_TYPE_HSA, + "Thunk pointer info mismatch"); + static_assert((int)HSA_POINTER_REGISTERED_USER == (int)HSA_EXT_POINTER_TYPE_LOCKED, + "Thunk pointer info mismatch"); + static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS, + "Thunk pointer info mismatch"); + + info->size = Min(info->size, sizeof(struct hsa_amd_pointer_info_v1_s)); + info->type = (hsa_amd_pointer_type_t)thunkInfo.Type; + info->agentBaseAddress = (void*)thunkInfo.GPUAddress; + info->hostBaseAddress = thunkInfo.CPUAddress; + info->sizeInBytes = thunkInfo.SizeInBytes; + info->userData = thunkInfo.UserData; + + if (returnListData) { + uint32_t count = 0; + for (int i = 0; i < thunkInfo.NMappedNodes; i++) { + assert(mappedNodes[i] < agents_by_node_.size() && + "PointerInfo: Invalid node ID returned from thunk."); + count += agents_by_node_[mappedNodes[i]].size(); + } + + *accessible = (hsa_agent_t*)alloc(sizeof(hsa_agent_t) * count); + if 
((*accessible) == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + *num_agents_accessible = count; + + uint32_t index = 0; + for (int i = 0; i < thunkInfo.NMappedNodes; i++) { + auto& list = agents_by_node_[mappedNodes[i]]; + for (int j = 0; j < list.size(); j++) { + (*accessible)[index] = list[j]->public_handle(); + index++; + } + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) { + if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_SUCCESS; + else + return HSA_STATUS_ERROR_INVALID_ARGUMENT; +} + +hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) { + static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle), + "Thunk IPC mismatch."); + if (hsaKmtShareMemory(ptr, len, (HsaSharedMemoryHandle*)handle) == HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_SUCCESS; + else + return HSA_STATUS_ERROR_INVALID_ARGUMENT; +} + +hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents, + Agent** agents, void** mapped_ptr) { + static const int tinyArraySize = 8; + void* importAddress; + HSAuint64 importSize; + HSAuint64 altAddress; + if (num_agents == 0) { + if (hsaKmtRegisterSharedHandle(reinterpret_cast(handle), + &importAddress, &importSize) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) != HSAKMT_STATUS_SUCCESS) { + hsaKmtDeregisterMemory(importAddress); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + *mapped_ptr = importAddress; + return HSA_STATUS_SUCCESS; + } + + HSAuint32* nodes = nullptr; + if (num_agents > tinyArraySize) + nodes = new HSAuint32[num_agents]; + else + nodes = (HSAuint32*)alloca(sizeof(HSAuint32) * num_agents); + if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + MAKE_SCOPE_GUARD([&]() { + if (num_agents > tinyArraySize) delete[] nodes; + }); + + for (int i = 0; i < 
num_agents; i++) + agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &nodes[i]); + + if (hsaKmtRegisterSharedHandleToNodes(reinterpret_cast(handle), + &importAddress, &importSize, num_agents, + nodes) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + HsaMemMapFlags map_flags; + map_flags.Value = 0; + map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB; + if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress, map_flags, num_agents, + nodes) != HSAKMT_STATUS_SUCCESS) { + map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB; + if (hsaKmtMapMemoryToGPUNodes(importAddress, importSize, &altAddress, map_flags, num_agents, + nodes) != HSAKMT_STATUS_SUCCESS) { + hsaKmtDeregisterMemory(importAddress); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + } + + *mapped_ptr = importAddress; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::IPCDetach(void* ptr) { + if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + return HSA_STATUS_SUCCESS; +} + void Runtime::AsyncEventsLoop(void*) { auto& async_events_control_ = runtime_singleton_->async_events_control_; auto& async_events_ = runtime_singleton_->async_events_; diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 2e1fb8867..1e1fe0f83 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -69,6 +69,12 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_INTERRUPT"); enable_interrupt_ = (var == "0") ? false : true; + var = os::GetEnvVar("HSA_ENABLE_THREAD_TRACE"); + enable_thread_trace_ = (var == "1") ? true : false; + + var = os::GetEnvVar("HSA_THREAD_TRACE_MEM_SIZE"); + thread_trace_buff_size_ = atoi(var.c_str()); + var = os::GetEnvVar("HSA_ENABLE_SDMA"); enable_sdma_ = (var == "0") ? 
false : true; @@ -98,6 +104,9 @@ class Flag { bool enable_interrupt() const { return enable_interrupt_; } + bool enable_thread_trace() const { return enable_thread_trace_; } + bool thread_trace_buff_size() const { return thread_trace_buff_size_; } + bool enable_sdma() const { return enable_sdma_; } bool emulate_aql() const { return emulate_aql_; } @@ -122,6 +131,9 @@ class Flag { bool sdma_wait_idle_; bool enable_queue_fault_message_; + bool enable_thread_trace_; + size_t thread_trace_buff_size_; + uint32_t max_queues_; size_t scratch_mem_size_; diff --git a/src/core/util/utils.h b/src/core/util/utils.h index 4e2fd90e2..7652e30b0 100644 --- a/src/core/util/utils.h +++ b/src/core/util/utils.h @@ -64,7 +64,6 @@ typedef uint64_t uint64; #endif #define __forceinline __inline__ __attribute__((always_inline)) -static __forceinline void __debugbreak() { __builtin_trap(); } #define __declspec(x) __attribute__((x)) #undef __stdcall #define __stdcall // __attribute__((__stdcall__)) diff --git a/src/hsacore.so.def b/src/hsacore.so.def index b8d76c517..995f9c117 100644 --- a/src/hsacore.so.def +++ b/src/hsacore.so.def @@ -203,6 +203,11 @@ global: hsa_ext_image_destroy; hsa_ext_sampler_create; hsa_ext_sampler_destroy; + hsa_amd_pointer_info; + hsa_amd_pointer_info_set_userdata; + hsa_amd_ipc_memory_create; + hsa_amd_ipc_memory_attach; + hsa_amd_ipc_memory_detach; local: *; diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h index 9a8f455d2..d4e42ee7b 100644 --- a/src/inc/hsa_api_trace.h +++ b/src/inc/hsa_api_trace.h @@ -140,6 +140,11 @@ struct AmdExtTable { decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn; decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn; decltype(hsa_amd_image_create)* hsa_amd_image_create_fn; + decltype(hsa_amd_pointer_info)* hsa_amd_pointer_info_fn; + decltype(hsa_amd_pointer_info_set_userdata)* hsa_amd_pointer_info_set_userdata_fn; + decltype(hsa_amd_ipc_memory_create)* hsa_amd_ipc_memory_create_fn; + 
decltype(hsa_amd_ipc_memory_attach)* hsa_amd_ipc_memory_attach_fn; + decltype(hsa_amd_ipc_memory_detach)* hsa_amd_ipc_memory_detach_fn; }; // Table to export HSA Core Runtime Apis diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h old mode 100644 new mode 100755 index ab7757389..a273e41ac --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -1140,7 +1140,7 @@ hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr); /** - * @brief Sets the first @p num of uint32_t of the block of memory pointed by + * @brief Sets the first @p count of uint32_t of the block of memory pointed by * @p ptr to the specified @p value. * * @param[in] ptr Pointer to the block of memory to fill. @@ -1268,6 +1268,225 @@ hsa_status_t HSA_API hsa_amd_image_create( hsa_ext_image_t *image ); +/** + * @brief Denotes the type of memory in a pointer info query. + */ +typedef enum { + /* + Memory is not known to the HSA driver. Unallocated or unlocked system memory. + */ + HSA_EXT_POINTER_TYPE_UNKNOWN = 0, + /* + Memory was allocated with an HSA memory allocator. + */ + HSA_EXT_POINTER_TYPE_HSA = 1, + /* + System memory which has been locked for use with an HSA agent. + + Memory of this type is normal malloc'd memory and is always accessible to + the CPU. Pointer info queries may not include CPU agents in the accessible + agents list as the CPU has implicit access. + */ + HSA_EXT_POINTER_TYPE_LOCKED = 2, + /* + Memory originated in a graphics component and is shared with ROCr. + */ + HSA_EXT_POINTER_TYPE_GRAPHICS = 3, + /* + Memory has been shared with the local process via ROCr IPC APIs. + */ + HSA_EXT_POINTER_TYPE_IPC = 4 +} hsa_amd_pointer_type_t; + +/** + * @brief Describes a memory allocation known to ROCr. + * Within a ROCr major version this structure can only grow. + */ +typedef struct hsa_amd_pointer_info_v1_s { + /* + Size in bytes of this structure. 
Used for version control within a major ROCr +  revision. Set to sizeof(hsa_amd_pointer_t) prior to calling +  hsa_amd_pointer_info. If the runtime supports an older version of pointer +  info then size will be smaller on return.  Members starting after the return +  value of size will not be updated by hsa_amd_pointer_info. +  */ +  uint32_t size; +  /* +  The type of allocation referenced. +  */ +  hsa_amd_pointer_type_t type; +  /* +  Base address at which non-host agents may access the allocation. +  */ +  void* agentBaseAddress; +  /* +  Base address at which the host agent may access the allocation. +  */ +  void* hostBaseAddress; +  /* +  Size of the allocation +  */ +  size_t sizeInBytes; +  /* +  Application provided value. +  */ +  void* userData; +} hsa_amd_pointer_info_t; + +/** + * @brief Retrieves information about the allocation referenced by the given + * pointer.  Optionally returns the number and list of agents which can + * directly access the allocation. + * + * @param[in] ptr Pointer which references the allocation to retrieve info for. + * + * @param[in, out] info Pointer to structure to be filled with allocation info. + * Data member size must be set to the size of the structure prior to calling + * hsa_amd_pointer_info.  On return size will be set to the size of the + * pointer info structure supported by the runtime, if smaller.  Members + * beyond the returned value of size will not be updated by the API. + * Must not be NULL. + * + * @param[in] alloc Function pointer to an allocator used to allocate the + * @p accessible array.  If NULL @p accessible will not be returned. + * + * @param[out] num_agents_accessible Receives the count of agents in + * @p accessible.  If NULL @p accessible will not be returned. + * + * @param[out] accessible Receives a pointer to the array, allocated by @p alloc, + * holding the list of agents which may directly access the allocation. + * May be NULL.
+ * + * @retval HSA_STATUS_SUCCESS Info retrieved successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info. + */ +hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr, +                                          hsa_amd_pointer_info_t* info, +                                          void* (*alloc)(size_t), +                                          uint32_t* num_agents_accessible, +                                          hsa_agent_t** accessible); + +/** + * @brief Associates an arbitrary pointer with an allocation known to ROCr. + * The pointer can be fetched by hsa_amd_pointer_info in the userData field. + * + * @param[in] ptr Pointer to the first byte of an allocation known to ROCr + * with which to associate @p userdata. + * + * @param[in] userdata Arbitrary pointer to associate with the allocation. + * + * @retval HSA_STATUS_SUCCESS @p userdata successfully stored. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr. + */ +hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(void* ptr, +                                                       void* userdata); + +/** + * @brief 256-bit process independent identifier for a ROCr shared memory + * allocation. + */ +typedef struct hsa_amd_ipc_memory_s { +  uint32_t handle[8]; +} hsa_amd_ipc_memory_t; + +/** + * @brief Prepares an allocation for interprocess sharing and creates a + * handle of type hsa_amd_ipc_memory_t uniquely identifying the allocation.  A + * handle is valid while the allocation it references remains accessible in + * any process.  In general applications should confirm that a shared memory + * region has been attached (via hsa_amd_ipc_memory_attach) in the remote + * process prior to releasing that memory in the local process.
+ * Repeated calls for the same allocation may, but are not required to, return + * unique handles. + * + * @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for + * sharing. + * + * @param[in] len Length in bytes of the allocation to share. + * + * @param[out] handle Process independent identifier referencing the shared + * allocation. + * + * @retval HSA_STATUS_SUCCESS allocation is prepared for interprocess sharing. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr does not point to the + * first byte of an allocation made through ROCr, or len is not the full length + * of the allocation or handle is NULL. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_create(void* ptr, size_t len, +                                               hsa_amd_ipc_memory_t* handle); + +/** + * @brief Imports shared memory into the local process and makes it accessible + * by the given agents.  If a shared memory handle is attached multiple times + * in a process each attach may return a different address.  Each returned + * address is refcounted and requires a matching number of calls to + * hsa_amd_ipc_memory_detach to release the shared memory mapping. + * + * @param[in] handle Pointer to the identifier for the shared memory. + * + * @param[in] len Length of the shared memory to import. + * Reserved. Must be the full length of the shared allocation in this version. + * + * @param[in] num_agents Count of agents in @p mapping_agents. + * May be zero if all agents are to be allowed access. + * + * @param[in] mapping_agents List of agents to access the shared memory. + * Ignored if @p num_agents is zero. + * + * @param[out] mapped_ptr Receives a process local pointer to the shared memory. + * + * @retval HSA_STATUS_SUCCESS if memory is successfully imported.
+ * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid, @p len is + * incorrect, @p mapped_ptr is NULL, or some agent for which access was + * requested can not access the shared memory. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_attach( + const hsa_amd_ipc_memory_t* handle, size_t len, + uint32_t num_agents, + const hsa_agent_t* mapping_agents, + void** mapped_ptr); + +/** + * @brief Decrements the reference count for the shared memory mapping and + * releases access to shared memory imported with hsa_amd_ipc_memory_attach. + * + * @param[in] mapped_ptr Pointer to the first byte of a shared allocation + * imported with hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_SUCCESS if @p mapped_ptr was imported with + * hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p mapped_ptr was not imported + * with hsa_amd_ipc_memory_attach. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_detach(void* mapped_ptr); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/src/inc/hsa_ven_amd_loader.h b/src/inc/hsa_ven_amd_loader.h index 804a360a2..020dd9173 100644 --- a/src/inc/hsa_ven_amd_loader.h +++ b/src/inc/hsa_ven_amd_loader.h @@ -224,6 +224,24 @@ hsa_status_t HSA_API hsa_ven_amd_loader_query_segment_descriptors( hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, size_t *num_segment_descriptors); +/** + * @brief Obtains the handle of executable to which the device address belongs. + * + * @details This method should not be used to obtain executable handle by using + * a host address. The executable returned is expected to be alive until its + * destroyed by the user. + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. 
+ * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there + * is no executable found for this kernel code object. + */ +hsa_status_t hsa_ven_amd_loader_query_executable( +  const void *device_address, +  hsa_executable_t *executable); + /** * @brief Extension version. */ @@ -240,6 +258,10 @@ typedef struct hsa_ven_amd_loader_1_00_pfn_s { hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, size_t *num_segment_descriptors); + +  hsa_status_t (*hsa_ven_amd_loader_query_executable)( +    const void *device_address, +    hsa_executable_t *executable); } hsa_ven_amd_loader_1_00_pfn_t; #ifdef __cplusplus diff --git a/src/libamdhsacode/amd_elf_image.cpp b/src/libamdhsacode/amd_elf_image.cpp index fb36d6234..b3dc1f5fb 100644 --- a/src/libamdhsacode/amd_elf_image.cpp +++ b/src/libamdhsacode/amd_elf_image.cpp @@ -423,9 +423,9 @@ namespace amd { RelocationSection* relocationSection(SymbolTable* symtab = 0) override; Segment* segment() override { return seg; } RelocationSection* asRelocationSection() override { return 0; } - bool setMemSize(uint64_t s) { memsize_ = s; return true; } + bool setMemSize(uint64_t s) override { memsize_ = s; return true; } uint64_t memSize() const override { return memsize_ ? memsize_ : size(); } - bool setAlign(uint64_t a) { align_ = a; return true; } + bool setAlign(uint64_t a) override { align_ = a; return true; } uint64_t memAlign() const override { return align_ ?
align_ : addralign(); } protected: @@ -474,7 +474,7 @@ namespace amd { bool push(const char* name, uint32_t shtype, uint64_t shflags); bool pullData() override; const char* addString(const std::string& s) override; - size_t addString1(const std::string& s); + size_t addString1(const std::string& s) override; const char* getString(size_t ndx) override; size_t getStringIndex(const char* name) override; @@ -510,12 +510,12 @@ namespace amd { uint32_t index() override { return eindex / sizeof(GElf_Rela); } uint32_t type() override { return GELF_ST_TYPE(Sym()->st_info); } - uint32_t binding() { return GELF_ST_BIND(Sym()->st_info); } - uint64_t size() { return Sym()->st_size; } - uint64_t value() { return Sym()->st_value; } - unsigned char other() { return Sym()->st_other; } + uint32_t binding() override { return GELF_ST_BIND(Sym()->st_info); } + uint64_t size() override { return Sym()->st_size; } + uint64_t value() override { return Sym()->st_value; } + unsigned char other() override { return Sym()->st_other; } std::string name() override; - Section* section(); + Section* section() override; void setValue(uint64_t value) override { Sym()->st_value = value; } void setSize(uint64_t size) override { Sym()->st_size = size; } @@ -665,8 +665,8 @@ namespace amd { bool initNew(uint16_t machine, uint16_t type, uint8_t os_abi = 0, uint8_t abi_version = 0, uint32_t e_flags = 0) override; bool loadFromFile(const std::string& filename) override; bool saveToFile(const std::string& filename) override; - bool initFromBuffer(const void* buffer, size_t size); - bool initAsBuffer(const void* buffer, size_t size); + bool initFromBuffer(const void* buffer, size_t size) override; + bool initAsBuffer(const void* buffer, size_t size) override; bool close(); bool writeTo(const std::string& filename) override; bool copyToBuffer(void** buf, size_t* size = 0) override; @@ -683,9 +683,9 @@ namespace amd { uint16_t Machine() override { return ehdr.e_machine; } uint16_t Type() override { return 
ehdr.e_type; } - GElfStringTable* shstrtab(); - GElfStringTable* strtab(); - GElfSymbolTable* getSymtab(uint16_t index) + GElfStringTable* shstrtab() override; + GElfStringTable* strtab() override; + GElfSymbolTable* getSymtab(uint16_t index) override { return static_cast<GElfSymbolTable*>(section(index)); } @@ -694,7 +694,7 @@ namespace amd { GElfStringTable* getStringTable(uint16_t index) override; GElfSymbolTable* addSymbolTable(const std::string& name, StringTable* stab = 0) override; - GElfSymbolTable* symtab(); + GElfSymbolTable* symtab() override; GElfSegment* segment(size_t i) override { return segments[i].get(); } Segment* segmentByVAddr(uint64_t vaddr) override; diff --git a/src/libamdhsacode/amd_options.hpp b/src/libamdhsacode/amd_options.hpp index c7255abd1..0af42f8c7 100644 --- a/src/libamdhsacode/amd_options.hpp +++ b/src/libamdhsacode/amd_options.hpp @@ -283,7 +283,7 @@ class ValueOption final: public OptionBase { /// @brief Not copy-assignable. ValueOption& operator=(const ValueOption &o); - bool ProcessTokens(std::list<std::string> &tokens); + bool ProcessTokens(std::list<std::string> &tokens) override; T value_; }; @@ -340,7 +340,7 @@ class ChoiceOption final: public OptionBase { /// @brief Not copy-assignable. ChoiceOption& operator =(const ChoiceOption&); - bool ProcessTokens(std::list<std::string> &tokens); + bool ProcessTokens(std::list<std::string> &tokens) override; std::unordered_set<std::string> choices_; std::string value_; @@ -370,7 +370,7 @@ class NoArgOption final: public OptionBase { /// @brief Not copy-assignable. 
NoArgOption& operator=(const NoArgOption &o); - bool ProcessTokens(std::list &tokens) { + bool ProcessTokens(std::list &tokens) override { assert(0 == name_.compare(tokens.front()) && "option name is mismatched"); if (1 == tokens.size()) { tokens.pop_front(); diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index ea457c1c7..c1465da94 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -529,15 +529,15 @@ bool VariableSymbol::GetInfo(hsa_symbol_info32_t symbol_info, void *value) { return true; } -bool LoadedCodeObjectImpl::GetInfo(hsa_loaded_code_object_info_t attribute, void *value) +bool LoadedCodeObjectImpl::GetInfo(amd_loaded_code_object_info_t attribute, void *value) { assert(value); switch (attribute) { - case HSA_LOADED_CODE_OBJECT_INFO_ELF_IMAGE: + case AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE: ((hsa_code_object_t*)value)->handle = reinterpret_cast(elf_data); break; - case HSA_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE: + case AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE: *((size_t*)value) = elf_size; break; default: { @@ -729,7 +729,7 @@ hsa_status_t ExecutableImpl::DefineAgentExternalVariable( return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED; } - agent_symbols_.insert( + auto insert_status = agent_symbols_.insert( std::make_pair(std::make_pair(std::string(name), agent), new VariableSymbol(true, std::string(name), @@ -742,6 +742,9 @@ hsa_status_t ExecutableImpl::DefineAgentExternalVariable( false, // TODO: const. 
true, reinterpret_cast(address)))); + assert(insert_status.second); + insert_status.first->second->agent = agent; + return HSA_STATUS_SUCCESS; } @@ -912,6 +915,25 @@ size_t ExecutableImpl::QuerySegmentDescriptors( return i - first_empty_segment_descriptor; } +hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address) +{ + hsa_executable_t execHandle = {0}; + ReaderLockGuard reader_lock(rw_lock_); + if (device_address == 0) { + return execHandle; + } + + for (auto &exec : executables) { + if (exec != nullptr) { + uint64_t host_address = exec->FindHostAddress(device_address); + if (host_address != 0) { + return Executable::Handle(exec); + } + } + } + return execHandle; +} + uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address) { for (auto &obj : loaded_code_objects) { @@ -985,10 +1007,9 @@ hsa_status_t ExecutableImpl::LoadCodeObject( hsa_agent_t agent, hsa_code_object_t code_object, const char *options, - hsa_loaded_code_object_t *loaded_code_object, - bool load_legacy) + hsa_loaded_code_object_t *loaded_code_object) { - return LoadCodeObject(agent, code_object, 0, options, loaded_code_object, load_legacy); + return LoadCodeObject(agent, code_object, 0, options, loaded_code_object); } hsa_status_t ExecutableImpl::LoadCodeObject( @@ -996,8 +1017,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( hsa_code_object_t code_object, size_t code_object_size, const char *options, - hsa_loaded_code_object_t *loaded_code_object, - bool load_legacy) + hsa_loaded_code_object_t *loaded_code_object) { WriterLockGuard writer_lock(rw_lock_); if (HSA_EXECUTABLE_STATE_FROZEN == state_) { @@ -1015,7 +1035,6 @@ hsa_status_t ExecutableImpl::LoadCodeObject( } code.reset(new code::AmdHsaCode()); - load_legacy_ = load_legacy; if (!code->InitAsHandle(code_object)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; @@ -1072,13 +1091,11 @@ hsa_status_t ExecutableImpl::LoadCodeObject( objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), 
code->ElfSize())); loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back()); - for (size_t i = 0; i < code->DataSegmentCount(); ++i) { - status = LoadSegment(agent, code->DataSegment(i), majorVersion, code->Machine()); - if (status != HSA_STATUS_SUCCESS) { return status; } - } + status = LoadSegments(agent, code.get(), majorVersion); + if (status != HSA_STATUS_SUCCESS) return status; for (size_t i = 0; i < code->SymbolCount(); ++i) { - status = LoadSymbol(agent, code->GetSymbol(i)); + status = LoadSymbol(agent, code->GetSymbol(i), majorVersion); if (status != HSA_STATUS_SUCCESS) { return status; } } @@ -1097,18 +1114,58 @@ hsa_status_t ExecutableImpl::LoadCodeObject( return HSA_STATUS_SUCCESS; } -hsa_status_t ExecutableImpl::LoadSegment(hsa_agent_t agent, code::Segment* s, - uint32_t majorVersion, uint16_t machine) -{ - if (majorVersion >= 2) - return LoadSegmentV2(agent, s, machine); +hsa_status_t ExecutableImpl::LoadSegments(hsa_agent_t agent, + const code::AmdHsaCode *c, + uint32_t majorVersion) { + if (majorVersion < 2) + return LoadSegmentsV1(agent, c); else - return LoadSegmentV1(agent, s); + return LoadSegmentsV2(agent, c); +} + +hsa_status_t ExecutableImpl::LoadSegmentsV1(hsa_agent_t agent, + const code::AmdHsaCode *c) { + hsa_status_t status = HSA_STATUS_SUCCESS; + for (size_t i = 0; i < c->DataSegmentCount(); ++i) { + status = LoadSegmentV1(agent, c->DataSegment(i)); + if (status != HSA_STATUS_SUCCESS) return status; + } + return HSA_STATUS_SUCCESS; } -hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, code::Segment* s) -{ +hsa_status_t ExecutableImpl::LoadSegmentsV2(hsa_agent_t agent, + const code::AmdHsaCode *c) { + assert(c->Machine() == EM_AMDGPU && "Program code objects are not supported"); + + if (!c->DataSegmentCount()) return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + + uint64_t vaddr = c->DataSegment(0)->vaddr(); + uint64_t size = c->DataSegment(c->DataSegmentCount() - 1)->vaddr() + + c->DataSegment(c->DataSegmentCount() - 
1)->memSize(); + + void *ptr = context_->SegmentAlloc(AMDGPU_HSA_SEGMENT_CODE_AGENT, agent, size, + AMD_ISA_ALIGN_BYTES, true); + if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + Segment *load_segment = new Segment(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT, + ptr, size, vaddr, 0); + if (!load_segment) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + hsa_status_t status = HSA_STATUS_SUCCESS; + for (size_t i = 0; i < c->DataSegmentCount(); ++i) { + status = LoadSegmentV2(c->DataSegment(i), load_segment); + if (status != HSA_STATUS_SUCCESS) return status; + } + + objects.push_back(load_segment); + loaded_code_objects.back()->LoadedSegments().push_back(load_segment); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, + const code::Segment *s) { assert(s->type() < PT_LOOS + AMDGPU_HSA_SEGMENT_LAST); if (s->memSize() == 0) return HSA_STATUS_SUCCESS; @@ -1135,20 +1192,33 @@ hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, code::Segment* s) return HSA_STATUS_SUCCESS; } -hsa_status_t ExecutableImpl::LoadSymbol(hsa_agent_t agent, code::Symbol* sym) +hsa_status_t ExecutableImpl::LoadSegmentV2(const code::Segment *data_segment, + loader::Segment *load_segment) { + assert(data_segment && load_segment); + load_segment->Copy(data_segment->vaddr(), data_segment->data(), + data_segment->imageSize()); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadSymbol(hsa_agent_t agent, + code::Symbol* sym, + uint32_t majorVersion) { if (sym->IsDeclaration()) { - return LoadDeclarationSymbol(agent, sym); + return LoadDeclarationSymbol(agent, sym, majorVersion); } else { - return LoadDefinitionSymbol(agent, sym); + return LoadDefinitionSymbol(agent, sym, majorVersion); } } -hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, code::Symbol* sym) +hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, + code::Symbol* sym, + uint32_t majorVersion) { bool isAgent = sym->IsAgent(); - 
if (!load_legacy_) { - isAgent = agent.handle == 0 ? false : true; + if (majorVersion >= 2) { + isAgent = agent.handle != 0; } if (isAgent) { auto agent_symbol = agent_symbols_.find(std::make_pair(sym->Name(), agent)); @@ -1231,6 +1301,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, code::Symbo } assert(symbol); if (isAgent) { + symbol->agent = agent; agent_symbols_.insert(std::make_pair(std::make_pair(sym->Name(), agent), symbol)); } else { program_symbols_.insert(std::make_pair(sym->Name(), symbol)); @@ -1238,7 +1309,9 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, code::Symbo return HSA_STATUS_SUCCESS; } -hsa_status_t ExecutableImpl::LoadDeclarationSymbol(hsa_agent_t agent, code::Symbol* sym) +hsa_status_t ExecutableImpl::LoadDeclarationSymbol(hsa_agent_t agent, + code::Symbol* sym, + uint32_t majorVersion) { auto program_symbol = program_symbols_.find(sym->Name()); if (program_symbol == program_symbols_.end()) { @@ -1578,39 +1651,6 @@ hsa_status_t ExecutableImpl::Freeze(const char *options) { return HSA_STATUS_SUCCESS; } -hsa_status_t ExecutableImpl::LoadSegmentV2(hsa_agent_t agent, code::Segment* s, uint16_t machine) -{ - amdgpu_hsa_elf_segment_t segment; - - if (s->memSize() == 0) - return HSA_STATUS_SUCCESS; - - // FIXME: Should support EM_HSA_VENDOR - if (machine == EM_AMDGPU) { - if (s->flags() & PF_X) - segment = AMDGPU_HSA_SEGMENT_CODE_AGENT; - else if (s->flags() & PF_W) - segment = AMDGPU_HSA_SEGMENT_GLOBAL_AGENT; - else { - assert (s->flags() & PF_R); - segment = AMDGPU_HSA_SEGMENT_READONLY_AGENT; - } - } else { // EM_HSA_SHARED - segment = AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM; - } - - void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true); - if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - - Segment *new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset()); - new_seg->Copy(s->vaddr(), s->data(), s->imageSize()); - 
objects.push_back(new_seg); - assert(new_seg); - - loaded_code_objects.back()->LoadedSegments().push_back(new_seg); - return HSA_STATUS_SUCCESS; -} - void ExecutableImpl::Print(std::ostream& out) { out << "AMD Executable" << std::endl; diff --git a/src/loader/executable.hpp b/src/loader/executable.hpp index 8f3c15307..ff897f332 100644 --- a/src/loader/executable.hpp +++ b/src/loader/executable.hpp @@ -112,7 +112,7 @@ class SymbolImpl: public Symbol { , is_definition(_is_definition) , address(_address) {} - virtual bool GetInfo(hsa_symbol_info32_t symbol_info, void *value); + virtual bool GetInfo(hsa_symbol_info32_t symbol_info, void *value) override; private: SymbolImpl(const SymbolImpl &s); @@ -257,7 +257,7 @@ class LoadedCodeObjectImpl : public LoadedCodeObject, public ExecutableObject { size_t ElfSize() const { return elf_size; } std::vector& LoadedSegments() { return loaded_segments; } - bool GetInfo(hsa_loaded_code_object_info_t attribute, void *value) override; + bool GetInfo(amd_loaded_code_object_info_t attribute, void *value) override; hsa_status_t IterateLoadedSegments( hsa_status_t (*callback)( @@ -361,35 +361,33 @@ class ExecutableImpl final: public Executable { ~ExecutableImpl(); - hsa_status_t GetInfo(hsa_executable_info_t executable_info, void *value); + hsa_status_t GetInfo(hsa_executable_info_t executable_info, void *value) override; hsa_status_t DefineProgramExternalVariable( - const char *name, void *address); + const char *name, void *address) override; hsa_status_t DefineAgentExternalVariable( const char *name, hsa_agent_t agent, hsa_variable_segment_t segment, - void *address); + void *address) override; hsa_status_t LoadCodeObject( hsa_agent_t agent, hsa_code_object_t code_object, const char *options, - hsa_loaded_code_object_t *loaded_code_object, - bool load_legacy = true); + hsa_loaded_code_object_t *loaded_code_object) override; hsa_status_t LoadCodeObject( hsa_agent_t agent, hsa_code_object_t code_object, size_t code_object_size, const 
char *options, - hsa_loaded_code_object_t *loaded_code_object, - bool load_legacy = true); + hsa_loaded_code_object_t *loaded_code_object) override; - hsa_status_t Freeze(const char *options); + hsa_status_t Freeze(const char *options) override; - hsa_status_t Validate(uint32_t *result) { + hsa_status_t Validate(uint32_t *result) override { amd::hsa::common::ReaderLockGuard reader_lock(rw_lock_); assert(result); *result = 0; @@ -405,7 +403,7 @@ class ExecutableImpl final: public Executable { const hsa_agent_t *agent) override; hsa_status_t IterateSymbols( - iterate_symbols_f callback, void *data); + iterate_symbols_f callback, void *data) override; /// @since hsa v1.1. hsa_status_t IterateAgentSymbols( @@ -427,7 +425,7 @@ class ExecutableImpl final: public Executable { hsa_status_t (*callback)( hsa_loaded_code_object_t loaded_code_object, void *data), - void *data); + void *data) override; size_t GetNumSegmentDescriptors() override; @@ -452,18 +450,22 @@ class ExecutableImpl final: public Executable { ExecutableImpl& operator=(const ExecutableImpl &e); std::unique_ptr code; - bool load_legacy_; Symbol* GetSymbolInternal( const char *symbol_name, const hsa_agent_t *agent); - hsa_status_t LoadSegment(hsa_agent_t agent, code::Segment* s, uint32_t majorVersion, uint16_t machine); - hsa_status_t LoadSegmentV1(hsa_agent_t agent, amd::hsa::code::Segment* seg); - hsa_status_t LoadSegmentV2(hsa_agent_t agent, amd::hsa::code::Segment* seg, uint16_t machine); - hsa_status_t LoadSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym); - hsa_status_t LoadDefinitionSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym); - hsa_status_t LoadDeclarationSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + hsa_status_t LoadSegments(hsa_agent_t agent, const code::AmdHsaCode *c, + uint32_t majorVersion); + hsa_status_t LoadSegmentsV1(hsa_agent_t agent, const code::AmdHsaCode *c); + hsa_status_t LoadSegmentsV2(hsa_agent_t agent, const code::AmdHsaCode *c); + hsa_status_t 
LoadSegmentV1(hsa_agent_t agent, const code::Segment *s); + hsa_status_t LoadSegmentV2(const code::Segment *data_segment, + loader::Segment *load_segment); + + hsa_status_t LoadSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym, uint32_t majorVersion); + hsa_status_t LoadDefinitionSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym, uint32_t majorVersion); + hsa_status_t LoadDeclarationSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym, uint32_t majorVersion); hsa_status_t ApplyRelocations(hsa_agent_t agent, amd::hsa::code::AmdHsaCode *c); hsa_status_t ApplyStaticRelocationSection(hsa_agent_t agent, amd::hsa::code::RelocationSection* sec); @@ -501,7 +503,7 @@ class AmdHsaCodeLoader : public Loader { AmdHsaCodeLoader(Context* context_) : context(context_) { assert(context); } - Context* GetContext() const { return context; } + Context* GetContext() const override { return context; } Executable* CreateExecutable( hsa_profile_t profile, @@ -520,6 +522,8 @@ class AmdHsaCodeLoader : public Loader { hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, size_t *num_segment_descriptors) override; + hsa_executable_t FindExecutable(uint64_t device_address) override; + uint64_t FindHostAddress(uint64_t device_address) override; void EnableReadOnlyMode(); diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp index 85a9ed2ec..81e13fbc6 100644 --- a/src/loader/loaders.hpp +++ b/src/loader/loaders.hpp @@ -83,25 +83,25 @@ namespace loader { bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override; - bool ImageExtensionSupported(); + bool ImageExtensionSupported() override; hsa_status_t ImageCreate( hsa_agent_t agent, hsa_access_permission_t image_permission, const hsa_ext_image_descriptor_t *image_descriptor, const void *image_data, - hsa_ext_image_t *image_handle); + hsa_ext_image_t *image_handle) override; hsa_status_t ImageDestroy( - hsa_agent_t agent, hsa_ext_image_t image_handle); + hsa_agent_t agent, 
hsa_ext_image_t image_handle) override; hsa_status_t SamplerCreate( hsa_agent_t agent, const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle); + hsa_ext_sampler_t *sampler_handle) override; hsa_status_t SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle); + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; }; } }