From 36f9c49c3922634ae045340f2c3c7452a7198e62 Mon Sep 17 00:00:00 2001
From: James Edwards
Date: Wed, 9 May 2018 12:57:52 -0500
Subject: [PATCH] ROCm 1.8.0 updates

---
 src/core/inc/amd_aql_queue.h | 163 +-------------
 src/core/inc/amd_blit_kernel.h | 12 +-
 src/core/inc/amd_blit_sdma.h | 59 +++--
 src/core/inc/amd_elf_image.hpp | 1 +
 src/core/inc/amd_gpu_agent.h | 56 +++--
 src/core/inc/amd_hsa_code.hpp | 1 +
 src/core/inc/blit.h | 11 +-
 src/core/inc/intercept_queue.h | 148 +------------
 src/core/inc/isa.h | 35 ++-
 src/core/inc/queue.h | 6 +-
 src/core/inc/runtime.h | 2 +-
 src/core/inc/signal.h | 172 +++++++++++++++
 src/core/runtime/amd_aql_queue.cpp | 261 +++++++++++++---------
 src/core/runtime/amd_blit_kernel.cpp | 57 +----
 src/core/runtime/amd_blit_sdma.cpp | 156 ++++++++++++--
 src/core/runtime/amd_gpu_agent.cpp | 262 ++++++++++++++--------
 src/core/runtime/amd_memory_region.cpp | 23 +-
 src/core/runtime/hsa.cpp | 101 ++++++++-
 src/core/runtime/hsa_ext_amd.cpp | 10 +
 src/core/runtime/intercept_queue.cpp | 2 +-
 src/core/runtime/isa.cpp | 48 +++--
 src/core/runtime/runtime.cpp | 76 ++++++-
 src/core/util/flag.h | 16 +-
 src/core/util/lazy_ptr.h | 125 +++++++++++
 src/core/util/small_heap.cpp | 185 ++++++++--------
 src/core/util/small_heap.h | 81 ++++---
 src/inc/amd_hsa_common.h | 8 +-
 src/inc/amd_hsa_elf.h | 5 +
 src/inc/amd_hsa_queue.h | 3 +-
 src/inc/hsa.h | 287 +++++++++++++------------
 src/inc/hsa_ext_amd.h | 11 +-
 src/libamdhsacode/amd_elf_image.cpp | 1 +
 src/loader/executable.cpp | 94 +++++++-
 src/loader/loaders.cpp | 26 +--
 src/loader/loaders.hpp | 6 +-
 35 files changed, 1517 insertions(+), 993 deletions(-)
 create mode 100644 src/core/util/lazy_ptr.h
 mode change 100755 => 100644 src/inc/hsa_ext_amd.h
 mode change 100755 => 100644 src/loader/loaders.cpp

diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h
index 104a0272c..795928bfb 100644
--- a/src/core/inc/amd_aql_queue.h
+++ b/src/core/inc/amd_aql_queue.h
@@ -53,7 +53,7 @@ namespace amd {
 /// @brief Encapsulates HW Aql Command Processor functionality. It
 /// provide the interface for things such as Doorbell register, read,
 /// write pointers and a buffer.
-class AqlQueue : public core::Queue, private core::LocalSignal, public core::Signal {
+class AqlQueue : public core::Queue, private core::LocalSignal, public core::DoorbellSignal {
  public:
   static __forceinline bool IsType(core::Signal* signal) {
     return signal->IsType(&rtti_id_);
@@ -183,164 +183,15 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Sig
   // @brief Submits a block of PM4 and waits until it has been executed.
void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override; - /// @brief This operation is illegal - hsa_signal_value_t LoadRelaxed() override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t LoadAcquire() override { - assert(false); - return 0; - } - /// @brief Update signal value using Relaxed semantics void StoreRelaxed(hsa_signal_value_t value) override; /// @brief Update signal value using Release semantics void StoreRelease(hsa_signal_value_t value) override; - /// @brief This operation is illegal - hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - void AndRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AndAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AndRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AndAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This 
operation is illegal - hsa_signal_value_t ExchRelease(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelease(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t* ValueLocation() const override { - assert(false); - return NULL; - } - - /// @brief This operation is illegal - HsaEvent* EopEvent() override { - assert(false); - return NULL; - } - protected: bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; } - /// @brief Disallow destroying doorbell apart from its queue. - void doDestroySignal() override { assert(false); } - private: uint32_t ComputeRingBufferMinPkts(); uint32_t ComputeRingBufferMaxPkts(); @@ -353,12 +204,16 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Sig void CloseRingBufferFD(const char* ring_buf_shm_path, int fd) const; int CreateRingBufferFD(const char* ring_buf_shm_path, uint32_t ring_buf_phys_size_bytes) const; - static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); - /// @brief Define the Scratch Buffer Descriptor and related parameters /// that enable kernel access scratch memory void InitScratchSRD(); + /// @brief Halt the queue without destroying it or fencing memory. + void Suspend(); + + /// @brief Handler for hardware queue events. + static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); + // AQL packet ring buffer void* ring_buf_; @@ -395,6 +250,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Sig uint32_t pm4_ib_size_b_; KernelMutex pm4_ib_mutex_; + // Error handler control variable. + std::atomic dynamicScratchState; + enum { ERROR_HANDLER_DONE = 1, ERROR_HANDLER_TERMINATE = 2, ERROR_HANDLER_SCRATCH_RETRY = 4 }; + // Shared event used for queue errors static HsaEvent* queue_event_; diff --git a/src/core/inc/amd_blit_kernel.h b/src/core/inc/amd_blit_kernel.h index 9d7090940..6ebc43640 100644 --- a/src/core/inc/amd_blit_kernel.h +++ b/src/core/inc/amd_blit_kernel.h @@ -76,24 +76,26 @@ class BlitKernel : public core::Blit { /// @brief Submit an AQL packet to perform vector copy. The call is blocking /// until the command execution is finished. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. - virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) override; + virtual hsa_status_t SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) override; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is non blocking. 
The memory transfer will start /// after all dependent signals are satisfied. After the transfer is /// completed, the out signal will be decremented. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. /// @param dep_signals Arrays of dependent signal. /// @param out_signal Output signal. virtual hsa_status_t SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) override; @@ -152,10 +154,6 @@ class BlitKernel : public core::Blit { /// packet processor doesn't get invalid packet. void ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet); - /// Wait until all packets are finished. - hsa_status_t FenceRelease(uint64_t write_index, uint32_t num_copy_packet, - hsa_fence_scope_t fence); - void PopulateQueue(uint64_t index, uint64_t code_handle, void* args, uint32_t grid_size_x, hsa_signal_t completion_signal); diff --git a/src/core/inc/amd_blit_sdma.h b/src/core/inc/amd_blit_sdma.h index fb0eb1abf..95fed4da4 100644 --- a/src/core/inc/amd_blit_sdma.h +++ b/src/core/inc/amd_blit_sdma.h @@ -61,6 +61,7 @@ class BlitSdmaBase : public core::Blit { static const size_t kCopyPacketSize; static const size_t kMaxSingleCopySize; static const size_t kMaxSingleFillSize; + virtual bool isSDMA() const override { return true; } }; // RingIndexTy: 32/64-bit monotonic ring index, counting in bytes. @@ -69,7 +70,7 @@ class BlitSdmaBase : public core::Blit { template class BlitSdma : public BlitSdmaBase { public: - explicit BlitSdma(); + explicit BlitSdma(bool copy_direction); virtual ~BlitSdma() override; @@ -94,24 +95,26 @@ class BlitSdma : public BlitSdmaBase { /// @brief Submit a linear copy command to the queue buffer. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. - virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) override; + virtual hsa_status_t SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) override; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is non blocking. The memory transfer will start /// after all dependent signals are satisfied. After the transfer is /// completed, the out signal will be decremented. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. /// @param dep_signals Arrays of dependent signal. /// @param out_signal Output signal. virtual hsa_status_t SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) override; @@ -125,7 +128,7 @@ class BlitSdma : public BlitSdmaBase { virtual hsa_status_t EnableProfiling(bool enable) override; - protected: + private: /// @brief Acquires the address into queue buffer where a new command /// packet of specified size could be written. 
The address that is /// returned is guaranteed to be unique even in a multi-threaded access @@ -170,6 +173,9 @@ class BlitSdma : public BlitSdmaBase { void BuildFenceCommand(char* fence_command_addr, uint32_t* fence, uint32_t fence_value); + /// @brief Build Hdp Flush command + void BuildHdpFlushCommand(char* cmd_addr); + uint32_t* ObtainFenceObject(); void WaitFence(uint32_t* fence, uint32_t fence_value); @@ -204,19 +210,25 @@ class BlitSdma : public BlitSdmaBase { RingIndexTy cached_reserve_index_; RingIndexTy cached_commit_index_; - uint32_t linear_copy_command_size_; + static const uint32_t linear_copy_command_size_; + + static const uint32_t fill_command_size_; + + static const uint32_t fence_command_size_; - uint32_t fill_command_size_; + static const uint32_t poll_command_size_; - uint32_t fence_command_size_; + static const uint32_t flush_command_size_; - uint32_t poll_command_size_; + static const uint32_t atomic_command_size_; - uint32_t atomic_command_size_; + static const uint32_t timestamp_command_size_; - uint32_t timestamp_command_size_; + static const uint32_t trap_command_size_; - uint32_t trap_command_size_; + // Flag to indicate if sDMA queue is used for H2D copy operations + // true if used for H2D operations, false otherwise + const bool sdma_h2d_; // Max copy size of a single linear copy command packet. size_t max_single_linear_copy_size_; @@ -232,19 +244,20 @@ class BlitSdma : public BlitSdmaBase { /// True if platform atomic is supported. bool platform_atomic_support_; + + /// True if sDMA supports HDP flush + bool hdp_flush_support_; }; -class BlitSdmaV2V3 - // Ring indices are 32-bit. - // HW ring indices are not monotonic (wrap at end of ring). - // Count fields of SDMA commands are 0-based. - : public BlitSdma {}; - -class BlitSdmaV4 - // Ring indices are 64-bit. - // HW ring indices are monotonic (do not wrap at end of ring). - // Count fields of SDMA commands are 1-based. - : public BlitSdma {}; +// Ring indices are 32-bit. +// HW ring indices are not monotonic (wrap at end of ring). +// Count fields of SDMA commands are 0-based. +typedef BlitSdma BlitSdmaV2V3; + +// Ring indices are 64-bit. +// HW ring indices are monotonic (do not wrap at end of ring). +// Count fields of SDMA commands are 1-based. +typedef BlitSdma BlitSdmaV4; } // namespace amd diff --git a/src/core/inc/amd_elf_image.hpp b/src/core/inc/amd_elf_image.hpp index 763c5c831..6667c3b36 100644 --- a/src/core/inc/amd_elf_image.hpp +++ b/src/core/inc/amd_elf_image.hpp @@ -207,6 +207,7 @@ namespace amd { virtual uint16_t Machine() = 0; virtual uint16_t Type() = 0; + virtual uint32_t EFlags() = 0; std::string output() { return out.str(); } diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index 58a70a2cb..ff8fb193e 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -56,6 +56,7 @@ #include "core/inc/cache.h" #include "core/util/small_heap.h" #include "core/util/locks.h" +#include "core/util/lazy_ptr.h" namespace amd { class MemoryRegion; @@ -66,6 +67,8 @@ struct ScratchInfo { size_t size; size_t size_per_thread; ptrdiff_t queue_process_offset; + bool large; + bool retry; }; // @brief Interface to represent a GPU agent. @@ -75,10 +78,8 @@ class GpuAgentInt : public core::Agent { GpuAgentInt(uint32_t node_id) : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} - // @brief Initialize DMA queue. - // - // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. 
- virtual void InitDma() = 0; + // @brief Ensure blits are ready (performance hint). + virtual void PreloadBlits() {} // @brief Initialization hook invoked after tools library has loaded, // to allow tools interception of interface functions. @@ -104,15 +105,15 @@ class GpuAgentInt : public core::Agent { // @brief Carve scratch memory from scratch pool. // - // @param [out] scratch Structure to be populated with the carved memory + // @param [in/out] scratch Structure to be populated with the carved memory // information. virtual void AcquireQueueScratch(ScratchInfo& scratch) = 0; // @brief Release scratch memory back to scratch pool. // - // @param [in] base Address of scratch memory previously acquired with - // call to ::AcquireQueueScratch. - virtual void ReleaseQueueScratch(void* base) = 0; + // @param [in/out] scratch Scratch memory previously acquired with call to + // ::AcquireQueueScratch. + virtual void ReleaseQueueScratch(ScratchInfo& base) = 0; // @brief Translate the kernel start and end dispatch timestamp from agent // domain to host domain. @@ -185,14 +186,16 @@ class GpuAgent : public GpuAgentInt { // @brief GPU agent destructor. ~GpuAgent(); - // @brief Override from core::Agent. - void InitDma() override; + // @brief Ensure blits are ready (performance hint). + void PreloadBlits() override; // @brief Override from core::Agent. hsa_status_t PostToolsInit() override; uint16_t GetMicrocodeVersion() const; + uint16_t GetSdmaMicrocodeVersion() const; + // @brief Assembles SP3 shader source into ISA or AQL code object. // // @param [in] src_sp3 SP3 shader source text representation. @@ -256,7 +259,20 @@ class GpuAgent : public GpuAgentInt { void AcquireQueueScratch(ScratchInfo& scratch) override; // @brief Override from amd::GpuAgentInt. - void ReleaseQueueScratch(void* base) override; + void ReleaseQueueScratch(ScratchInfo& scratch) override; + + // @brief Register signal for notification when scratch may become available. + // @p signal is notified by OR'ing with @p value. + void AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) { + ScopedAcquire lock(&scratch_lock_); + scratch_notifiers_[signal] = value; + } + + // @brief Deregister scratch notification signal. + void RemoveScratchNotifier(hsa_signal_t signal) { + ScopedAcquire lock(&scratch_lock_); + scratch_notifiers_.erase(signal); + } // @brief Override from amd::GpuAgentInt. void TranslateTime(core::Signal* signal, @@ -326,7 +342,7 @@ class GpuAgent : public GpuAgentInt { // @brief Create SDMA blit object. // // @retval NULL if SDMA blit creation and initialization failed. - core::Blit* CreateBlitSdma(); + core::Blit* CreateBlitSdma(bool h2d); // @brief Create Kernel blit object using provided compute queue. // @@ -367,6 +383,12 @@ class GpuAgent : public GpuAgentInt { // @brief Object to manage scratch memory. SmallHeap scratch_pool_; + // @brief Current short duration scratch memory size. + size_t scratch_used_large_; + + // @brief Notifications for scratch release. + std::map scratch_notifiers_; + // @brief Default scratch size per queue. size_t queue_scratch_len_; @@ -376,7 +398,7 @@ class GpuAgent : public GpuAgentInt { // @brief Blit interfaces for each data path. enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount }; - core::Blit* blits_[BlitCount]; + lazy_ptr blits_[BlitCount]; // @brief AQL queues for cache management and blit compute usage. 
enum QueueEnum { @@ -385,7 +407,7 @@ class GpuAgent : public GpuAgentInt { QueueCount }; - core::Queue* queues_[QueueCount]; + lazy_ptr queues_[QueueCount]; // @brief Mutex to protect the update to coherency type. KernelMutex coherency_lock_; @@ -443,6 +465,9 @@ class GpuAgent : public GpuAgentInt { // @brief Query the driver to get the cache properties. void InitCacheList(); + // @brief Create internal queues and blits. + void InitDma(); + // @brief Initialize memory pool for end timestamp object. // @retval True if the memory pool for end timestamp object is initialized. bool InitEndTsPool(); @@ -453,9 +478,6 @@ class GpuAgent : public GpuAgentInt { // @brief Alternative aperture size. Only on KV. size_t ape1_size_; - // @brief True if blit objects are initialized. - std::atomic blit_initialized_; - // Each end ts is 32 bytes. static const size_t kTsSize = 32; diff --git a/src/core/inc/amd_hsa_code.hpp b/src/core/inc/amd_hsa_code.hpp index a20207914..4d79660e9 100644 --- a/src/core/inc/amd_hsa_code.hpp +++ b/src/core/inc/amd_hsa_code.hpp @@ -267,6 +267,7 @@ namespace code { const amd::elf::Section* HsaText() const { assert(hsatext); return hsatext; } amd::elf::SymbolTable* Symtab() { assert(img); return img->symtab(); } uint16_t Machine() const { return img->Machine(); } + uint32_t EFlags() const { return img->EFlags(); } AmdHsaCode(bool combineDataSegments = true); virtual ~AmdHsaCode(); diff --git a/src/core/inc/blit.h b/src/core/inc/blit.h index 48aebaa64..640551a2f 100644 --- a/src/core/inc/blit.h +++ b/src/core/inc/blit.h @@ -75,24 +75,26 @@ class Blit { /// control block. The call is blocking until the command execution is /// finished. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. - virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) = 0; + virtual hsa_status_t SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) = 0; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is non blocking. The memory transfer will start /// after all dependent signals are satisfied. After the transfer is /// completed, the out signal will be decremented. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. /// @param dep_signals Arrays of dependent signal. /// @param out_signal Output signal. virtual hsa_status_t SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) = 0; /// @brief Submit a linear fill command to the the underlying compute device's @@ -113,6 +115,9 @@ class Blit { /// @return HSA_STATUS_SUCCESS if the request to enable/disable profiling is /// successful. virtual hsa_status_t EnableProfiling(bool enable) = 0; + + /// @brief Blit operations use SDMA. + virtual bool isSDMA() const { return false; } }; } // namespace core diff --git a/src/core/inc/intercept_queue.h b/src/core/inc/intercept_queue.h index 82ec222fe..3f1cd956e 100644 --- a/src/core/inc/intercept_queue.h +++ b/src/core/inc/intercept_queue.h @@ -183,7 +183,7 @@ class QueueProxy : public QueueWrapper { // @brief Provides packet intercept and rewrite capability for a queue. 
// Host-side dispatches are processed during doorbell ring. // Device-side dispatches are processed as an asynchronous signal event. -class InterceptQueue : public QueueProxy, private LocalSignal, public Signal { +class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSignal { public: explicit InterceptQueue(std::unique_ptr queue); ~InterceptQueue(); @@ -250,152 +250,6 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public Signal { StoreRelaxed(value); } - /// @brief This operation is illegal - hsa_signal_value_t LoadRelaxed() { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t LoadAcquire() { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - void AndRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AndAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AndRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AndAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - 
hsa_signal_value_t ExchRelease(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelease(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t* ValueLocation() const { - assert(false); - return NULL; - } - - /// @brief This operation is illegal - HsaEvent* EopEvent() { - assert(false); - return NULL; - } - static __forceinline bool IsType(core::Signal* signal) { return signal->IsType(&rtti_id_); } static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id_); } diff --git a/src/core/inc/isa.h b/src/core/inc/isa.h index f2c224f2f..1dd6576cc 100644 --- a/src/core/inc/isa.h +++ b/src/core/inc/isa.h @@ -106,18 +106,30 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> { const Version &version() const { return version_; } + /// @returns True if this Isa has xnack enabled, false otherwise. + const bool &xnackEnabled() const { + return xnackEnabled_; + } /// @returns This Isa's supported wavefront. const Wavefront &wavefront() const { return wavefront_; } + /// @returns This Isa's architecture. + std::string GetArchitecture() const { + return "amdgcn"; + } /// @returns This Isa's vendor. std::string GetVendor() const { - return "AMD"; + return "amd"; } - /// @returns This Isa's architecture. - std::string GetArchitecture() const { - return "AMDGPU"; + /// @returns This Isa's OS. + std::string GetOS() const { + return "amdhsa"; + } + /// @returns This Isa's environment. + std::string GetEnvironment() const { + return ""; } /// @returns This Isa's major version. int32_t GetMajorVersion() const { @@ -140,7 +152,8 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> { /// otherwise. bool IsCompatible(const Isa *isa_object) const { assert(isa_object); - return version_ == isa_object->version_; + return version_ == isa_object->version_ && + xnackEnabled_ == isa_object->xnackEnabled_; } /// @returns True if this Isa is compatible with @p isa_handle, false /// otherwise. @@ -168,14 +181,20 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> { private: /// @brief Default constructor. - Isa(): version_(Version(-1, -1, -1)) {} + Isa(): version_(Version(-1, -1, -1)), xnackEnabled_(false) {} /// @brief Construct from @p version. - Isa(const Version &version): version_(version) {} + Isa(const Version &version): version_(version), xnackEnabled_(false) {} + + /// @brief Construct from @p version. + Isa(const Version &version, const bool xnack): version_(version), xnackEnabled_(xnack) {} /// @brief Isa's version. Version version_; + /// @brief Isa's supported xnack flag. + bool xnackEnabled_; + /// @brief Isa's supported wavefront. 
  Wavefront wavefront_;

@@ -190,7 +209,7 @@ class IsaRegistry final {
   /// @returns Isa for requested @p full_name, null pointer if not supported.
   static const Isa *GetIsa(const std::string &full_name);
   /// @returns Isa for requested @p version, null pointer if not supported.
-  static const Isa *GetIsa(const Isa::Version &version);
+  static const Isa *GetIsa(const Isa::Version &version, bool xnack);

  private:
   /// @brief IsaRegistry's map type.
diff --git a/src/core/inc/queue.h b/src/core/inc/queue.h
index d5fc23209..811206457 100644
--- a/src/core/inc/queue.h
+++ b/src/core/inc/queue.h
@@ -75,8 +75,7 @@ struct AqlPacket {

   std::string string() const {
     std::stringstream string;
-    uint8_t type = ((dispatch.header >> HSA_PACKET_HEADER_TYPE) &
-                    ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1));
+    uint8_t type = this->type();

     const char* type_names[] = {
         "HSA_PACKET_TYPE_VENDOR_SPECIFIC", "HSA_PACKET_TYPE_INVALID",
@@ -318,10 +317,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
   virtual void do_set_public_handle(hsa_queue_t* handle) {
     public_handle_ = handle;
   }
-  hsa_queue_t* public_handle_;

   virtual bool _IsA(rtti_t id) const = 0;

+  hsa_queue_t* public_handle_;
+
  private:
   DISALLOW_COPY_AND_ASSIGN(Queue);
 };
diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h
index ac0d4e8d3..d1aed4e26 100644
--- a/src/core/inc/runtime.h
+++ b/src/core/inc/runtime.h
@@ -510,7 +510,7 @@ class Runtime {
   void* vm_fault_handler_user_data_;

   // Holds reference count to runtime object.
-  volatile uint32_t ref_count_;
+  std::atomic<uint32_t> ref_count_;

   // Track environment variables.
   Flag flag_;
diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h
index ddc31ebd7..401b2c66e 100644
--- a/src/core/inc/signal.h
+++ b/src/core/inc/signal.h
@@ -60,6 +60,18 @@

 #include "inc/amd_hsa_signal.h"

+// Allow hsa_signal_t to be keys in STL structures.
+namespace std {
+template <> struct less<hsa_signal_t> {
+  __forceinline bool operator()(const hsa_signal_t& x, const hsa_signal_t& y) const {
+    return x.handle < y.handle;
+  }
+  typedef hsa_signal_t first_argument_type;
+  typedef hsa_signal_t second_argument_type;
+  typedef bool result_type;
+};
+}
+
 namespace core {
 class Agent;
 class Signal;
@@ -333,6 +345,166 @@ class Signal {
   DISALLOW_COPY_AND_ASSIGN(Signal);
 };

+/// @brief Handle signal operations which are not for use on doorbells.
+class DoorbellSignal : public Signal { + public: + using Signal::Signal; + + /// @brief This operation is illegal + hsa_signal_value_t LoadRelaxed() final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t LoadAcquire() final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + void AndRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AndAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AndRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AndAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t 
ExchRelease(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasRelease(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t* ValueLocation() const final override { + assert(false); + return NULL; + } + + /// @brief This operation is illegal + HsaEvent* EopEvent() final override { + assert(false); + return NULL; + } + + protected: + /// @brief Disallow destroying doorbell apart from its queue. + void doDestroySignal() final override { assert(false); } +}; + struct hsa_signal_handle { hsa_signal_t signal; diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index ab8fee67a..cac25fcf4 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -72,7 +72,7 @@ namespace amd { // Queue::amd_queue_ is cache-aligned for performance. const uint32_t kAmdQueueAlignBytes = 0x40; -HsaEvent* AqlQueue::queue_event_ = NULL; +HsaEvent* AqlQueue::queue_event_ = nullptr; std::atomic AqlQueue::queue_count_(0); KernelMutex AqlQueue::queue_lock_; int AqlQueue::rtti_id_ = 0; @@ -81,7 +81,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr core::HsaEventCallback callback, void* err_data, bool is_kv) : Queue(), LocalSignal(0), - Signal(signal()), + DoorbellSignal(signal()), ring_buf_(nullptr), ring_buf_alloc_bytes_(0), queue_id_(HSA_QUEUEID(-1)), @@ -92,7 +92,8 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr errors_data_(err_data), is_kv_queue_(is_kv), pm4_ib_buf_(nullptr), - pm4_ib_size_b_(0x1000) { + pm4_ib_size_b_(0x1000), + dynamicScratchState(0) { // When queue_full_workaround_ is set to 1, the ring buffer is internally // doubled in size. Virtual addresses in the upper half of the ring allocation // are mapped to the same set of pages backing the lower half. @@ -279,17 +280,26 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr } AqlQueue::~AqlQueue() { - Inactivate(); + // Remove error handler synchronously. + // Sequences error handler callbacks with queue destroy. 
+ dynamicScratchState |= ERROR_HANDLER_TERMINATE; + HSA::hsa_signal_store_screlease(amd_queue_.queue_inactive_signal, 0x8000000000000000ull); + while ((dynamicScratchState & ERROR_HANDLER_DONE) != ERROR_HANDLER_DONE) { + HSA::hsa_signal_wait_relaxed(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, + 0x8000000000000000ull, -1ull, HSA_WAIT_STATE_BLOCKED); + HSA::hsa_signal_store_relaxed(amd_queue_.queue_inactive_signal, 0x8000000000000000ull); + } + Inactivate(); + agent_->ReleaseQueueScratch(queue_scratch_); FreeRegisteredRingBuffer(); - agent_->ReleaseQueueScratch(queue_scratch_.queue_base); HSA::hsa_signal_destroy(amd_queue_.queue_inactive_signal); if (core::g_use_interrupt_wait) { ScopedAcquire lock(&queue_lock_); queue_count_--; if (queue_count_ == 0) { core::InterruptSignal::DestroyEvent(queue_event_); - queue_event_ = NULL; + queue_event_ = nullptr; } } core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_); @@ -471,7 +481,7 @@ uint32_t AqlQueue::ComputeRingBufferMaxPkts() { } void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { - if (agent_->profile() == HSA_PROFILE_FULL) { + if ((agent_->profile() == HSA_PROFILE_FULL) && queue_full_workaround_) { // Compute the physical and virtual size of the queue. uint32_t ring_buf_phys_size_bytes = uint32_t(queue_size_pkts * sizeof(core::AqlPacket)); @@ -602,23 +612,22 @@ void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { } else { // Allocate storage for the ring buffer. ring_buf_alloc_bytes_ = AlignUp( - queue_size_pkts * static_cast(sizeof(core::AqlPacket)), 4096); + queue_size_pkts * sizeof(core::AqlPacket), 4096); ring_buf_ = core::Runtime::runtime_singleton_->system_allocator()( - ring_buf_alloc_bytes_, 0x1000, - core::MemoryRegion::AllocateExecutable | - core::MemoryRegion::AllocateDoubleMap); + ring_buf_alloc_bytes_, 0x1000, core::MemoryRegion::AllocateExecutable | + (queue_full_workaround_ ? core::MemoryRegion::AllocateDoubleMap : 0)); assert(ring_buf_ != NULL && "AQL queue memory allocation failure"); // The virtual ring allocation is twice as large as requested. // Each half maps to the same set of physical pages. 
- ring_buf_alloc_bytes_ *= 2; + if (queue_full_workaround_) ring_buf_alloc_bytes_ *= 2; } } void AqlQueue::FreeRegisteredRingBuffer() { - if (agent_->profile() == HSA_PROFILE_FULL) { + if ((agent_->profile() == HSA_PROFILE_FULL) && queue_full_workaround_) { #ifdef __linux__ munmap(ring_buf_, ring_buf_alloc_bytes_); #endif @@ -676,110 +685,166 @@ int AqlQueue::CreateRingBufferFD(const char* ring_buf_shm_path, #endif } +void AqlQueue::Suspend() { + auto err = hsaKmtUpdateQueue(queue_id_, 0, HSA_QUEUE_PRIORITY_NORMAL, NULL, 0, NULL); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtUpdateQueue failed."); +} + hsa_status_t AqlQueue::Inactivate() { bool active = active_.exchange(false, std::memory_order_relaxed); if (active) { - auto err = hsaKmtDestroyQueue(this->queue_id_); + auto err = hsaKmtDestroyQueue(queue_id_); assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtDestroyQueue failed."); + atomic::Fence(std::memory_order_acquire); } return HSA_STATUS_SUCCESS; } bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { AqlQueue* queue = (AqlQueue*)arg; + hsa_status_t errorCode = HSA_STATUS_SUCCESS; + bool fatal = false; + bool changeWait = false; + hsa_signal_value_t waitVal; + + if ((queue->dynamicScratchState & ERROR_HANDLER_SCRATCH_RETRY) == ERROR_HANDLER_SCRATCH_RETRY) { + queue->dynamicScratchState &= ~ERROR_HANDLER_SCRATCH_RETRY; + queue->agent_->RemoveScratchNotifier(queue->amd_queue_.queue_inactive_signal); + changeWait = true; + waitVal = 0; + HSA::hsa_signal_and_relaxed(queue->amd_queue_.queue_inactive_signal, ~0x8000000000000000ull); + error_code &= ~0x8000000000000000ull; + } - if ((error_code & 1) == 1) { - // Insufficient scratch - recoverable - auto& scratch = queue->queue_scratch_; + // Process errors only if queue is not terminating. + if ((queue->dynamicScratchState & ERROR_HANDLER_TERMINATE) != ERROR_HANDLER_TERMINATE) { + if (error_code == 512) { // Large scratch reclaim + auto& scratch = queue->queue_scratch_; + queue->agent_->ReleaseQueueScratch(scratch); + scratch.queue_base = nullptr; + scratch.size = 0; + scratch.size_per_thread = 0; + scratch.queue_process_offset = 0; + queue->InitScratchSRD(); + + HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0); + // Resumes queue processing. + atomic::Store(&queue->amd_queue_.queue_properties, + queue->amd_queue_.queue_properties & (~AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE), + std::memory_order_release); + atomic::Fence(std::memory_order_release); + return true; + } + + // Process only one queue error. + if (error_code == 1) { + // Insufficient scratch - recoverable, don't process dynamic scratch if errors are present. + auto& scratch = queue->queue_scratch_; + + queue->agent_->ReleaseQueueScratch(scratch); + + uint64_t pkt_slot_idx = + queue->amd_queue_.read_dispatch_id & (queue->amd_queue_.hsa_queue.size - 1); - queue->agent_->ReleaseQueueScratch(scratch.queue_base); + core::AqlPacket& pkt = + ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; - uint64_t pkt_slot_idx = queue->amd_queue_.read_dispatch_id % queue->amd_queue_.hsa_queue.size; + uint32_t scratch_request = pkt.dispatch.private_segment_size; - const core::AqlPacket& pkt = - ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; + scratch.size_per_thread = scratch_request; + // Align whole waves to 1KB. 
+ scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); + scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * + queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize; - uint32_t scratch_request = pkt.dispatch.private_segment_size; + queue->agent_->AcquireQueueScratch(scratch); - scratch.size_per_thread = - Max(uint32_t(scratch.size_per_thread * 2), scratch_request); - // Align whole waves to 1KB. - scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); - scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * - queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize; + if (scratch.retry) { + queue->agent_->AddScratchNotifier(queue->amd_queue_.queue_inactive_signal, + 0x8000000000000000ull); + queue->dynamicScratchState |= ERROR_HANDLER_SCRATCH_RETRY; + changeWait = true; + waitVal = error_code; + } else { + // Out of scratch - promote error + if (scratch.queue_base == nullptr) { + errorCode = HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } else { + // Mark large scratch allocation for single use. + if (scratch.large) { + queue->amd_queue_.queue_properties |= AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE; + // Set system release fence to flush scratch stores with older firmware versions. + if ((queue->agent_->isa()->GetMajorVersion() == 8) && + (queue->agent_->GetMicrocodeVersion() < 729)) { + pkt.dispatch.header &= ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1) + << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + pkt.dispatch.header |= + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + } + } + // Reset scratch memory related entities for the queue + queue->InitScratchSRD(); + // Restart the queue. + HSA::hsa_signal_store_screlease(queue->amd_queue_.queue_inactive_signal, 0); + } + } - queue->agent_->AcquireQueueScratch(scratch); - if (scratch.queue_base == NULL) { - // Out of scratch - promote error and invalidate queue - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_OUT_OF_RESOURCES, - queue->public_handle(), queue->errors_data_); - return false; + } else if ((error_code & 2) == 2) { // Invalid dim + errorCode = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + + } else if ((error_code & 4) == 4) { // Invalid group memory + errorCode = HSA_STATUS_ERROR_INVALID_ALLOCATION; + + } else if ((error_code & 8) == 8) { // Invalid (or NULL) code + errorCode = HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + + } else if (((error_code & 32) == 32) || // Invalid format: 32 is generic, + ((error_code & 256) == 256)) { // 256 is vendor specific packets + errorCode = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT; + + } else if ((error_code & 64) == 64) { // Group is too large + errorCode = HSA_STATUS_ERROR_INVALID_ARGUMENT; + + } else if ((error_code & 128) == 128) { // Out of VGPRs + errorCode = HSA_STATUS_ERROR_INVALID_ISA; + + } else if ((error_code & 0x80000000) == 0x80000000) { // Debug trap + errorCode = HSA_STATUS_ERROR_EXCEPTION; + fatal = true; + + } else { // Undefined code + assert(false && "Undefined queue error code"); + errorCode = HSA_STATUS_ERROR; + fatal = true; } - // Reset scratch memory related entities for the queue - queue->InitScratchSRD(); - - } else if ((error_code & 2) == 2) { // Invalid dim - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, - queue->public_handle(), queue->errors_data_); - return false; - - } else if 
((error_code & 4) == 4) { // Invalid group memory
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ALLOCATION,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-
-  } else if ((error_code & 8) == 8) { // Invalid (or NULL) code
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_CODE_OBJECT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-
-  } else if (((error_code & 32) == 32) ||
-             ((error_code & 256) == 256)) { // Invalid format: 32 is generic,
-                                            // 256 is vendor specific packets
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_PACKET_FORMAT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 64) == 64) { // Group is too large
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ARGUMENT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 128) == 128) { // Out of VGPRs
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ISA,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 0x80000000) == 0x80000000) { // Debug trap
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_EXCEPTION,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else {
-    // Undefined code
-    queue->Inactivate();
-    assert(false && "Undefined queue error code");
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR, queue->public_handle(),
-                              queue->errors_data_);
-    return false;
-  }
+    if (errorCode == HSA_STATUS_SUCCESS) {
+      if (changeWait) {
+        core::Runtime::runtime_singleton_->SetAsyncSignalHandler(
+            queue->amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, waitVal,
+            DynamicScratchHandler, queue);
+        return false;
+      }
+      return true;
+    }
-  HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0);
-  return true;
+    queue->Suspend();
+    if (queue->errors_callback_ != nullptr) {
+      queue->errors_callback_(errorCode, queue->public_handle(), queue->errors_data_);
+    }
+    if (fatal) {
+      // Temporarily removed until there is clarity on exactly what debugtrap's semantics are.
+      // assert(false && "Fatal queue error");
+      // std::abort();
+    }
+  }
+  // Copy here is to protect against queue being released between setting the scratch state and
+  // updating the signal value. The signal itself is safe to use because it is ref counted rather
+  // than being released with the queue.
+  hsa_signal_t signal = queue->amd_queue_.queue_inactive_signal;
+  queue->dynamicScratchState = ERROR_HANDLER_DONE;
+  HSA::hsa_signal_store_screlease(signal, -1ull);
+  return false;
 }

 hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count,
@@ -800,7 +865,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) {
   // Obtain a queue slot for a single AQL packet.
uint64_t write_idx = queue->AddWriteIndexAcqRel(1); - while ((write_idx - queue->LoadReadIndexRelaxed()) > queue->amd_queue_.hsa_queue.size) { + while ((write_idx - queue->LoadReadIndexRelaxed()) >= queue->amd_queue_.hsa_queue.size) { os::YieldThread(); } diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp index b9941d69e..a4b77fa09 100644 --- a/src/core/runtime/amd_blit_kernel.cpp +++ b/src/core/runtime/amd_blit_kernel.cpp @@ -587,8 +587,8 @@ hsa_status_t BlitKernel::Destroy(const core::Agent& agent) { return HSA_STATUS_SUCCESS; } -hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) { +hsa_status_t BlitKernel::SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) { // Protect completion_signal_. std::lock_guard guard(lock_); @@ -597,7 +597,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, std::vector dep_signals(0); hsa_status_t stat = SubmitLinearCopyCommand( - dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_)); + p2p, dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_)); if (stat != HSA_STATUS_SUCCESS) { return stat; @@ -614,7 +614,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, } hsa_status_t BlitKernel::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) { // Reserve write index for barrier(s) + dispatch packet. const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5); @@ -624,6 +624,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand( uint64_t write_index_temp = write_index; // Insert barrier packets to handle dependent signals. + // Barrier bit keeps signal checking traffic from competing with a copy. const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | @@ -775,8 +776,7 @@ hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value, } hsa_status_t BlitKernel::EnableProfiling(bool enable) { - AMD_HSA_BITS_SET(queue_->amd_queue_.queue_properties, - AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, enable); + queue_->SetProfiling(enable); return HSA_STATUS_SUCCESS; } @@ -799,51 +799,6 @@ void BlitKernel::ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet) { doorbell->StoreRelease(write_index + num_packet - 1); } -hsa_status_t BlitKernel::FenceRelease(uint64_t write_index, - uint32_t num_copy_packet, - hsa_fence_scope_t fence) { - // This function is not thread safe. - - const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (fence << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - - hsa_barrier_and_packet_t packet = {0}; - packet.header = kInvalidPacketHeader; - - HSA::hsa_signal_store_relaxed(completion_signal_, 1); - packet.completion_signal = completion_signal_; - - if (num_copy_packet == 0) { - assert(write_index == 0); - // Reserve write index. - write_index = AcquireWriteIndex(1); - } - - // Populate queue buffer with AQL packet. 
- hsa_barrier_and_packet_t* queue_buffer = - reinterpret_cast( - queue_->public_handle()->base_address); - std::atomic_thread_fence(std::memory_order_acquire); - queue_buffer[(write_index + num_copy_packet) & queue_bitmask_] = packet; - std::atomic_thread_fence(std::memory_order_release); - queue_buffer[(write_index + num_copy_packet) & queue_bitmask_].header = - kBarrierPacketHeader; - - // Launch packet. - ReleaseWriteIndex(write_index, num_copy_packet + 1); - - // Wait for the packet to finish. - if (HSA::hsa_signal_wait_scacquire(packet.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE) != 0) { - // Signal wait returned unexpected value. - return HSA_STATUS_ERROR; - } - - return HSA_STATUS_SUCCESS; -} - void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args, uint32_t grid_size_x, hsa_signal_t completion_signal) { diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp index 7daf92ba8..47218ec67 100644 --- a/src/core/runtime/amd_blit_sdma.cpp +++ b/src/core/runtime/amd_blit_sdma.cpp @@ -360,6 +360,20 @@ typedef struct SDMA_PKT_TRAP_TAG { } INT_CONTEXT_UNION; } SDMA_PKT_TRAP; +// Initialize Hdp flush packet for use on sDMA of devices +// from Gfx9 or new family +static const SDMA_PKT_POLL_REGMEM hdp_flush_cmd_ { + { SDMA_OP_POLL_REGMEM }, + { 0x00 }, + { 0x80000000 }, + { 0x00 }, + { 0x00 }, + { 0x00 }, +}; + +// Version of sDMA microcode supporting Hdp flush +static const uint16_t sdma_version_ = 0x01A5; + inline uint32_t ptrlow32(const void* p) { return static_cast(reinterpret_cast(p)); } @@ -377,8 +391,33 @@ const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR); const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0; +// Initialize size of various sDMA commands use by this module +template +const uint32_t BlitSdma::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR); + +template +const uint32_t BlitSdma::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); + +template +const uint32_t BlitSdma::fence_command_size_ = sizeof(SDMA_PKT_FENCE); + +template +const uint32_t BlitSdma::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); + +template +const uint32_t BlitSdma::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); + +template +const uint32_t BlitSdma::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); + template -BlitSdma::BlitSdma() +const uint32_t BlitSdma::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); + +template +const uint32_t BlitSdma::trap_command_size_ = sizeof(SDMA_PKT_TRAP); + +template +BlitSdma::BlitSdma(bool copy_direction) : agent_(NULL), queue_start_addr_(NULL), fence_base_addr_(NULL), @@ -386,7 +425,9 @@ BlitSdma::BlitSdma() fence_pool_counter_(0), cached_reserve_index_(0), cached_commit_index_(0), - platform_atomic_support_(true) { + sdma_h2d_(copy_direction), + platform_atomic_support_(true), + hdp_flush_support_(false) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } @@ -407,14 +448,6 @@ hsa_status_t BlitSdma::Initial return HSA_STATUS_ERROR; } - linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR); - fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); - fence_command_size_ = sizeof(SDMA_PKT_FENCE); - poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); - atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); - timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); - trap_command_size_ = sizeof(SDMA_PKT_TRAP); - const amd::GpuAgentInt& amd_gpu_agent = 
static_cast(agent); @@ -423,10 +456,16 @@ hsa_status_t BlitSdma::Initial return HSA_STATUS_ERROR; } - if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) { + if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1) || + amd_gpu_agent.isa()->GetMajorVersion() == 9) { platform_atomic_support_ = false; } + // Determine if sDMA microcode supports HDP flush command + if (agent_->GetSdmaMicrocodeVersion() >= sdma_version_) { + hdp_flush_support_ = true; + } + // Allocate queue buffer. queue_start_addr_ = (char*)core::Runtime::runtime_singleton_->system_allocator()( kQueueSize, 0x1000, core::MemoryRegion::AllocateExecutable); @@ -499,7 +538,7 @@ hsa_status_t BlitSdma::Destroy template hsa_status_t BlitSdma::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size) { + bool p2p, void* dst, const void* src, size_t size) { // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; @@ -507,8 +546,16 @@ hsa_status_t BlitSdma::SubmitL const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; + // Add space for acquire or release Hdp flush command + uint32_t flush_cmd_size = 0; + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (p2p)) { + flush_cmd_size = flush_command_size_; + } + } + const uint32_t total_command_size = - total_copy_command_size + fence_command_size_; + total_copy_command_size + fence_command_size_ + flush_cmd_size; const uint32_t kFenceValue = 2015; uint32_t* fence_addr = ObtainFenceObject(); @@ -521,10 +568,25 @@ hsa_status_t BlitSdma::SubmitL return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - BuildCopyCommand(command_addr, num_copy_command, dst, src, size); + // Determine if a Hdp flush cmd is required at the top of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + BuildCopyCommand(command_addr, num_copy_command, dst, src, size); command_addr += total_copy_command_size; + // Determine if a Hdp flush cmd is required at the end of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + BuildFenceCommand(command_addr, fence_addr, kFenceValue); ReleaseWriteAddress(curr_index, total_command_size); @@ -536,7 +598,7 @@ hsa_status_t BlitSdma::SubmitL template hsa_status_t BlitSdma::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, std::vector& dep_signals, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) { // The signal is 64 bit value, and poll checks for 32 bit value. So we // need to use two poll operations per dependent signal. @@ -592,9 +654,17 @@ hsa_status_t BlitSdma::SubmitL ? 
(fence_command_size_ + trap_command_size_) : 0; + // Add space for acquire or release Hdp flush command + uint32_t flush_cmd_size = 0; + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (p2p)) { + flush_cmd_size = flush_command_size_; + } + } + const uint32_t total_command_size = total_poll_command_size + total_copy_command_size + sync_command_size + - total_timestamp_command_size + interrupt_command_size; + total_timestamp_command_size + interrupt_command_size + flush_cmd_size; RingIndexTy curr_index; char* command_addr = AcquireWriteAddress(total_command_size, curr_index); @@ -620,11 +690,26 @@ hsa_status_t BlitSdma::SubmitL command_addr += timestamp_command_size_; } + // Determine if a Hdp flush cmd is required at the top of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + // Do the transfer after all polls are satisfied. BuildCopyCommand(command_addr, num_copy_command, dst, src, size); - command_addr += total_copy_command_size; + // Determine if a Hdp flush cmd is required at the end of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + if (profiling_enabled) { assert(IsMultipleOf(end_ts_addr, 32)); BuildGetGlobalTimestampCommand(command_addr, @@ -684,8 +769,24 @@ hsa_status_t BlitSdma::SubmitL const uint32_t total_fill_command_size = num_fill_command * fill_command_size_; + // Add space for acquire or release Hdp flush command + uint32_t flush_cmd_size = 0; + + /* + * @note: Commenting this block of code. This is safe since this method + * is never entered. Runtime binds client requests to BlitKernels i.e. 
+ * the Blit object being chosen is blit[dev-to-dev] + */ + /* + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_)) { + flush_cmd_size = flush_command_size_; + } + } + */ + const uint32_t total_command_size = - total_fill_command_size + fence_command_size_; + total_fill_command_size + fence_command_size_ + flush_cmd_size; RingIndexTy curr_index; char* command_addr = AcquireWriteAddress(total_command_size, curr_index); @@ -723,6 +824,17 @@ hsa_status_t BlitSdma::SubmitL assert(cur_size == size); + // Determine if a Hdp flush cmd is required at the end of cmd stream + // @note: Blocked per comments above lines - 776-778 + /* + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + */ + const uint32_t kFenceValue = 2015; uint32_t* fence_addr = ObtainFenceObject(); *fence_addr = 0; @@ -1017,6 +1129,14 @@ void BlitSdma::BuildTrapComman packet_addr->HEADER_UNION.op = SDMA_OP_TRAP; } +template +void BlitSdma::BuildHdpFlushCommand( + char* cmd_addr) { + assert(cmd_addr != NULL); + SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast(cmd_addr); + memcpy(addr, &hdp_flush_cmd_, flush_command_size_); +} + template class BlitSdma; template class BlitSdma; diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 78bede5d2..37a000be4 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include "core/inc/amd_aql_queue.h" #include "core/inc/amd_blit_kernel.h" @@ -83,7 +85,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) memory_max_frequency_(0), ape1_base_(0), ape1_size_(0), - blit_initialized_(false), end_ts_pool_size_(0), end_ts_pool_counter_(0), end_ts_base_addr_(NULL) { @@ -97,7 +98,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) // Set instruction set architecture via node property, only on GPU device. isa_ = (core::Isa*)core::IsaRegistry::GetIsa(core::Isa::Version( node_props.EngineId.ui32.Major, node_props.EngineId.ui32.Minor, - node_props.EngineId.ui32.Stepping)); + node_props.EngineId.ui32.Stepping), profile_ == HSA_PROFILE_FULL); // Check if the device is Kaveri, only on GPU device. 
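The HDP-flush gating repeated in the copy and fill paths above can be read as one small decision. A hypothetical helper (illustration only, not part of this patch) that captures it:

enum class HdpFlushPos { kNone, kBeforeCopy, kAfterCopy };

// Mirrors the conditions used above: the flush is emitted only when the
// HSA_ENABLE_SDMA_HDP_FLUSH flag is on, the engine uses monotonic ring
// indices, the SDMA microcode is new enough, and the copy is peer-to-peer.
// Placement then follows the copy direction handled by this blit object.
static HdpFlushPos HdpFlushPlacement(bool flag_enabled, bool hw_index_monotonic,
                                     bool hdp_flush_support, bool p2p, bool h2d) {
  if (!(flag_enabled && hw_index_monotonic && hdp_flush_support && p2p))
    return HdpFlushPos::kNone;
  return h2d ? HdpFlushPos::kAfterCopy : HdpFlushPos::kBeforeCopy;
}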
if (isa_->GetMajorVersion() == 7 && isa_->GetMinorVersion() == 0 && @@ -131,17 +132,12 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) GpuAgent::~GpuAgent() { for (int i = 0; i < BlitCount; ++i) { - if (blits_[i] != NULL) { + if (blits_[i] != nullptr) { hsa_status_t status = blits_[i]->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); - delete blits_[i]; } } - for (int i = 0; i < QueueCount; ++i) { - delete queues_[i]; - } - if (end_ts_base_addr_ != NULL) { core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_); } @@ -336,7 +332,6 @@ void GpuAgent::InitScratchPool() { // scratch/thread const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU; - queue_scratch_len_ = 0; queue_scratch_len_ = AlignUp(32 * 64 * num_cu * scratch_per_thread_, 65536); size_t max_scratch_len = queue_scratch_len_ * max_queues_; @@ -358,7 +353,7 @@ void GpuAgent::InitScratchPool() { if (HSAKMT_STATUS_SUCCESS == err) { new (&scratch_pool_) SmallHeap(scratch_base, max_scratch_len); } else { - new (&scratch_pool_) SmallHeap(NULL, 0); + new (&scratch_pool_) SmallHeap(); } } @@ -521,13 +516,13 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() { return queue; } -core::Blit* GpuAgent::CreateBlitSdma() { +core::Blit* GpuAgent::CreateBlitSdma(bool h2d) { core::Blit* sdma; if (isa_->GetMajorVersion() <= 8) { - sdma = new BlitSdmaV2V3; + sdma = new BlitSdmaV2V3(h2d); } else { - sdma = new BlitSdmaV4; + sdma = new BlitSdmaV4(h2d); } if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) { @@ -552,74 +547,97 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { } void GpuAgent::InitDma() { - // This provides the ability to lazy init the blit objects on places that - // could give indication of DMA usage in the future. E.g.: - // 1. Call to allow access API. - // 2. Call to memory lock API. - if (!blit_initialized_.load(std::memory_order_acquire)) { - ScopedAcquire lock(&blit_lock_); - if (!blit_initialized_.load(std::memory_order_relaxed)) { - // Try create SDMA blit first. - // TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified. - if ((isa_->GetMajorVersion() != 8) && - core::Runtime::runtime_singleton_->flag().enable_sdma() && - (HSA_PROFILE_BASE == profile_)) { - blits_[BlitHostToDev] = CreateBlitSdma(); - blits_[BlitDevToHost] = CreateBlitSdma(); - - if (blits_[BlitHostToDev] != NULL && blits_[BlitDevToHost] != NULL) { - blit_initialized_.store(true, std::memory_order_release); - return; - } - } - - // Fall back to blit kernel if SDMA is unavailable. - if (blits_[BlitHostToDev] == NULL) { - // Create a dedicated compute queue for host-to-device blits. - queues_[QueueBlitOnly] = CreateInterceptibleQueue(); - assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed"); + // Setup lazy init pointers on queues and blits. + auto queue_lambda = [this]() { + auto ret = CreateInterceptibleQueue(); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Internal queue creation failed."); + return ret; + }; + // Dedicated compute queue for host-to-device blits. + queues_[QueueBlitOnly].reset(queue_lambda); + // Share utility queue with device-to-host blits. + queues_[QueueUtility].reset(queue_lambda); + + // Decide which engine to use for blits. + auto blit_lambda = [this](bool h2d, lazy_ptr& queue) { + std::string sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); + bool use_sdma = (sdma_override.size() == 0) ? 
(isa_->GetMajorVersion() != 8) : (sdma_override == "1"); + + if (use_sdma && (HSA_PROFILE_BASE == profile_)) { + auto ret = CreateBlitSdma(h2d); + if (ret != nullptr) return ret; + } - blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]); - assert(blits_[BlitHostToDev] != NULL && "Blit creation failed"); - } + auto ret = CreateBlitKernel((*queue).get()); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); + return ret; + }; - if (blits_[BlitDevToHost] == NULL) { - // Share utility queue with device-to-host blits. - if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue(); - blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]); - assert(blits_[BlitDevToHost] != NULL && "Blit creation failed"); - } + blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueBlitOnly]); }); + blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); }); + blits_[BlitDevToDev].reset([this]() { + auto ret = CreateBlitKernel((*queues_[QueueUtility]).get()); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); + return ret; + }); +} - blit_initialized_.store(true, std::memory_order_release); - } - } +void GpuAgent::PreloadBlits() { + blits_[BlitHostToDev].touch(); + blits_[BlitDevToHost].touch(); + blits_[BlitDevToDev].touch(); } hsa_status_t GpuAgent::PostToolsInit() { // Defer memory allocation until agents have been discovered. InitScratchPool(); BindTrapHandler(); + InitDma(); - // Defer utility queue creation to allow tools to intercept. - if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue(); - - if (queues_[QueueUtility] == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - // Share utility queue with device-to-device blits. - if (blits_[BlitDevToDev] == nullptr) - blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]); + return HSA_STATUS_SUCCESS; +} - if (blits_[BlitDevToDev] == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; +struct DmaDeps_t { + bool p2p; + void* dst; + const void* src; + size_t size; + core::Signal* out_signal; + core::Blit* blit; + std::unique_ptr> deps; +}; + +static bool DmaDeps(hsa_signal_value_t val, void* arg) { + DmaDeps_t* Args = (DmaDeps_t*)arg; + std::vector& deps = *(Args->deps.get()); + if (val != 0) return true; + for (int i = deps.size() - 1; i != 0; i--) { + if (deps[i - 1]->LoadRelaxed() != 0) { + deps.resize(i); + hsa_status_t err = core::Runtime::runtime_singleton_->SetAsyncSignalHandler( + core::Signal::Convert(deps.back()), HSA_SIGNAL_CONDITION_EQ, 0, DmaDeps, arg); + assert(err == HSA_STATUS_SUCCESS && "Failed to update dependency handler."); + return false; + } } - - return HSA_STATUS_SUCCESS; + deps.clear(); + hsa_status_t stat; + do { // Only ready to run copies are on the SDMA queue so if resources are busy they will soon be + // free. 
+ stat = Args->blit->SubmitLinearCopyCommand(Args->p2p, Args->dst, Args->src, Args->size, deps, + *(Args->out_signal)); + } while (stat != HSA_STATUS_SUCCESS); + delete Args; + return false; } hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) { - return blits_[BlitDevToDev]->SubmitLinearCopyCommand(dst, src, size); + // This operation is not a P2P operation - uses BlitKernel + return blits_[BlitDevToDev]->SubmitLinearCopyCommand(false, dst, src, size); } hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, @@ -627,26 +645,42 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, size_t size, std::vector& dep_signals, core::Signal& out_signal) { - core::Blit* blit = - (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice) - ? blits_[BlitHostToDev] - : (src_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.device_type() == core::Agent::kAmdCpuDevice) - ? blits_[BlitDevToHost] - : blits_[BlitDevToDev]; - - if (blit == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } + lazy_ptr& blit = + (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice) + ? blits_[BlitHostToDev] + : (src_agent.device_type() == core::Agent::kAmdGpuDevice && + dst_agent.device_type() == core::Agent::kAmdCpuDevice) + ? blits_[BlitDevToHost] + : (src_agent.node_id() == dst_agent.node_id()) + ? blits_[BlitDevToDev] : blits_[BlitDevToHost]; if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system // domain correctly. - out_signal.async_copy_agent(this); + out_signal.async_copy_agent(core::Agent::Convert(this->public_handle())); } - hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + // Determine if this is a Peer-To-Peer copy operation + bool p2p = ((src_agent.node_id() != dst_agent.node_id()) && + (src_agent.device_type() == core::Agent::kAmdGpuDevice) && + (dst_agent.device_type() == core::Agent::kAmdGpuDevice)); + + if ((dep_signals.size() != 0) && blit->isSDMA()) { + DmaDeps_t* Arg = new DmaDeps_t; + Arg->p2p = p2p; + Arg->dst = dst; + Arg->src = src; + Arg->size = size; + Arg->out_signal = &out_signal; + Arg->blit = (*blit).get(); + Arg->deps.reset(new std::vector(std::move(dep_signals))); + hsa_status_t stat = core::Runtime::runtime_singleton_->SetAsyncSignalHandler( + core::Signal::Convert(Arg->deps->back()), HSA_SIGNAL_CONDITION_EQ, 0, DmaDeps, Arg); + return stat; + } + + hsa_status_t stat = blit->SubmitLinearCopyCommand(p2p, dst, src, size, dep_signals, out_signal); return stat; } @@ -915,8 +949,9 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU; scratch.size = scratch.size_per_thread * 32 * 64 * num_cu; scratch.queue_base = nullptr; + scratch.queue_process_offset = 0; - MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueScratch(scratch.queue_base); }); + MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueScratch(scratch); }); if (scratch.size != 0) { AcquireQueueScratch(scratch); @@ -925,6 +960,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, } } + // Ensure utility queue has been created. + // Deferring longer risks exhausting queue count before ISA upload and invalidation capability is + // ensured. 
+ queues_[QueueUtility].touch(); + // Create an HW AQL queue *queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_); scratchGuard.Dismiss(); @@ -939,30 +979,50 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { scratch.size_per_thread = scratch_per_thread_; } + scratch.retry = false; + ScopedAcquire lock(&scratch_lock_); - scratch.queue_base = scratch_pool_.alloc(scratch.size); + // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue. + size_t small_limit = scratch_pool_.size() >> 3; + size_t single_limit = small_limit >> 2; + bool large = (scratch.size > single_limit) || + (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); + large = (isa_->GetMajorVersion() < 8) ? false : large; + if (large) + scratch.queue_base = scratch_pool_.alloc_high(scratch.size); + else + scratch.queue_base = scratch_pool_.alloc(scratch.size); + large |= scratch.queue_base > scratch_pool_.high_split(); + scratch.large = large; + scratch.queue_process_offset = (need_queue_scratch_base) ? uintptr_t(scratch.queue_base) : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); - if (scratch.queue_base != NULL) { + if (scratch.queue_base != nullptr) { if (profile_ == HSA_PROFILE_FULL) return; if (profile_ == HSA_PROFILE_BASE) { HSAuint64 alternate_va; - if (HSAKMT_STATUS_SUCCESS == - hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va)) + if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == + HSAKMT_STATUS_SUCCESS) { + if (large) scratch_used_large_ += scratch.size; return; + } } } // Scratch request failed allocation or mapping. scratch_pool_.free(scratch.queue_base); - scratch.queue_base = NULL; + scratch.queue_base = nullptr; + + // Retry if large may yield needed space. + if (scratch_used_large_ != 0) { + scratch.retry = true; + return; + } -// Attempt to trim the maximum number of concurrent waves to allow scratch to fit. -// This is somewhat dangerous as it limits the number of concurrent waves from future dispatches -// on the queue if those waves use even small amounts of scratch. + // Attempt to trim the maximum number of concurrent waves to allow scratch to fit. if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) debug_print("Failed to map requested scratch - reducing queue occupancy.\n"); uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; @@ -973,7 +1033,7 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { size_t size = waves_per_cu * num_cus * size_per_wave; void* base = scratch_pool_.alloc(size); HSAuint64 alternate_va; - if ((base != NULL) && + if ((base != nullptr) && ((profile_ == HSA_PROFILE_FULL) || (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) { // Scratch allocated and either full profile or map succeeded. @@ -983,6 +1043,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { (need_queue_scratch_base) ? 
uintptr_t(scratch.queue_base) : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); + scratch.large = true; + scratch_used_large_ += scratch.size; return; } scratch_pool_.free(base); @@ -990,23 +1052,29 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { } // Failed to allocate minimal scratch - assert(scratch.queue_base == NULL && "bad scratch data"); + assert(scratch.queue_base == nullptr && "bad scratch data"); if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) debug_print("Could not allocate scratch for one wave per CU.\n"); } -void GpuAgent::ReleaseQueueScratch(void* base) { - if (base == NULL) { +void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) { + if (scratch.queue_base == nullptr) { return; } ScopedAcquire lock(&scratch_lock_); if (profile_ == HSA_PROFILE_BASE) { - if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) { + if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) { assert(false && "Unmap scratch subrange failed!"); } } - scratch_pool_.free(base); + scratch_pool_.free(scratch.queue_base); + + if (scratch.large) scratch_used_large_ -= scratch.size; + + // Notify waiters that additional scratch may be available. + for (auto notifier : scratch_notifiers_) + HSA::hsa_signal_or_relaxed(notifier.first, notifier.second); } void GpuAgent::TranslateTime(core::Signal* signal, @@ -1082,6 +1150,10 @@ uint16_t GpuAgent::GetMicrocodeVersion() const { return (properties_.EngineId.ui32.uCode); } +uint16_t GpuAgent::GetSdmaMicrocodeVersion() const { + return (properties_.uCodeEngineVersions.uCodeSDMA); +} + void GpuAgent::SyncClocks() { HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t1_); assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error"); diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index 99fbdd10a..39814839e 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -136,8 +136,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } - max_single_alloc_size_ = - AlignDown(static_cast(GetPhysicalSize()), kPageSize_); + max_single_alloc_size_ = AlignDown(static_cast(GetPhysicalSize()), kPageSize_); mem_flag_.ui32.CoarseGrain = (fine_grain) ? 0 : 1; @@ -288,16 +287,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, } break; case HSA_REGION_INFO_SIZE: - switch (mem_props_.HeapType) { - case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: - case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: - *((size_t*)value) = static_cast(GetPhysicalSize()); - break; - default: - *((size_t*)value) = static_cast( - (full_profile()) ? 
GetVirtualSize() : GetPhysicalSize()); - break; - } + *((size_t*)value) = static_cast(GetPhysicalSize()); break; case HSA_REGION_INFO_ALLOC_MAX_SIZE: switch (mem_props_.HeapType) { @@ -535,7 +525,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, lock.Release(); for (GpuAgentInt* gpu : whitelist_gpus) { - gpu->InitDma(); + gpu->PreloadBlits(); } return HSA_STATUS_SUCCESS; @@ -584,7 +574,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, if (agent->device_type() == core::Agent::kAmdGpuDevice) { whitelist_nodes.push_back(agent->node_id()); - whitelist_gpus.insert(reinterpret_cast(agent)); + whitelist_gpus.insert(agent); } } } @@ -607,8 +597,9 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, } else { *agent_ptr = host_ptr; } - for (core::Agent* gpu : whitelist_gpus) { - reinterpret_cast(gpu)->InitDma(); + + for (auto gpu : whitelist_gpus) { + static_cast(gpu)->PreloadBlits(); } return HSA_STATUS_SUCCESS; diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index 0c55e3810..c33b78f13 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -376,7 +376,7 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin return 0; } - char buff[3]; + char buff[6]; sprintf(buff, "%02u", minor); name += std::to_string(major) + "_" + buff + "_pfn_t"; @@ -1821,6 +1821,71 @@ hsa_status_t hsa_code_object_destroy( CATCH; } +static std::string ConvertOldTargetNameToNew( + const std::string &OldName, bool IsFinalizer, uint32_t EFlags) { + std::string NewName = ""; + + // FIXME #1: Should 9:0:3 be completely (loader, sc, etc.) removed? + // FIXME #2: What does PAL do with respect to boltzmann/usual fiji/tonga? + if (OldName == "AMD:AMDGPU:7:0:0") + NewName = "amdgcn-amd-amdhsa--gfx700"; + else if (OldName == "AMD:AMDGPU:7:0:1") + NewName = "amdgcn-amd-amdhsa--gfx701"; + else if (OldName == "AMD:AMDGPU:7:0:2") + NewName = "amdgcn-amd-amdhsa--gfx702"; + else if (OldName == "AMD:AMDGPU:7:0:3") + NewName = "amdgcn-amd-amdhsa--gfx703"; + else if (OldName == "AMD:AMDGPU:7:0:4") + NewName = "amdgcn-amd-amdhsa--gfx704"; + else if (OldName == "AMD:AMDGPU:8:0:0") + NewName = "amdgcn-amd-amdhsa--gfx800"; + else if (OldName == "AMD:AMDGPU:8:0:1") + NewName = "amdgcn-amd-amdhsa--gfx801"; + else if (OldName == "AMD:AMDGPU:8:0:2") + NewName = "amdgcn-amd-amdhsa--gfx802"; + else if (OldName == "AMD:AMDGPU:8:0:3") + NewName = "amdgcn-amd-amdhsa--gfx803"; + else if (OldName == "AMD:AMDGPU:8:0:4") + NewName = "amdgcn-amd-amdhsa--gfx804"; + else if (OldName == "AMD:AMDGPU:8:1:0") + NewName = "amdgcn-amd-amdhsa--gfx810"; + else if (OldName == "AMD:AMDGPU:9:0:0") + NewName = "amdgcn-amd-amdhsa--gfx900"; + else if (OldName == "AMD:AMDGPU:9:0:1") + NewName = "amdgcn-amd-amdhsa--gfx900"; + else if (OldName == "AMD:AMDGPU:9:0:2") + NewName = "amdgcn-amd-amdhsa--gfx902"; + else if (OldName == "AMD:AMDGPU:9:0:3") + NewName = "amdgcn-amd-amdhsa--gfx902"; + else if (OldName == "AMD:AMDGPU:9:0:4") + NewName = "amdgcn-amd-amdhsa--gfx904"; + else if (OldName == "AMD:AMDGPU:9:0:6") + NewName = "amdgcn-amd-amdhsa--gfx906"; + else + assert(false && "Unhandled target"); + + if (IsFinalizer && (EFlags & EF_AMDGPU_XNACK)) { + NewName = NewName + "+xnack"; + } else { + if (EFlags != 0 && (EFlags & EF_AMDGPU_XNACK_LC)) { + NewName = NewName + "+xnack"; + } else { + if (OldName == "AMD:AMDGPU:8:0:1") + NewName = NewName + "+xnack"; + else if (OldName == "AMD:AMDGPU:8:1:0") + NewName = NewName + "+xnack"; + else if (OldName 
== "AMD:AMDGPU:9:0:1") + NewName = NewName + "+xnack"; + else if (OldName == "AMD:AMDGPU:9:0:2") + NewName = NewName + "+xnack"; + else if (OldName == "AMD:AMDGPU:9:0:3") + NewName = NewName + "+xnack"; + } + } + + return NewName; +} + /* deprecated */ hsa_status_t hsa_code_object_get_info( hsa_code_object_t code_object, @@ -1843,8 +1908,26 @@ hsa_status_t hsa_code_object_get_info( return status; } + std::string isa_name_str(isa_name); + + bool IsFinalizer = true; + uint32_t codeHsailMajor; + uint32_t codeHsailMinor; + hsa_profile_t codeProfile; + hsa_machine_model_t codeMachineModel; + hsa_default_float_rounding_mode_t codeRoundingMode; + if (!code->GetNoteHsail(&codeHsailMajor, &codeHsailMinor, + &codeProfile, &codeMachineModel, + &codeRoundingMode)) { + // Only finalizer generated the "HSAIL" note. + IsFinalizer = false; + } + + std::string new_isa_name_str = + ConvertOldTargetNameToNew(isa_name_str, IsFinalizer, code->EFlags()); + hsa_isa_t isa_handle = {0}; - status = HSA::hsa_isa_from_name(isa_name, &isa_handle); + status = HSA::hsa_isa_from_name(new_isa_name_str.c_str(), &isa_handle); if (status != HSA_STATUS_SUCCESS) { return status; } @@ -2593,22 +2676,26 @@ hsa_status_t hsa_status_string( break; case HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER: *status_string = - "HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER: *The code object reader is invalid."; + "HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER: The code object reader is invalid."; break; case HSA_STATUS_ERROR_INVALID_CACHE: - *status_string = "HSA_STATUS_ERROR_INVALID_CACHE: *The cache is invalid."; + *status_string = "HSA_STATUS_ERROR_INVALID_CACHE: The cache is invalid."; break; case HSA_STATUS_ERROR_INVALID_WAVEFRONT: - *status_string = "HSA_STATUS_ERROR_INVALID_WAVEFRONT: *The wavefront is invalid."; + *status_string = "HSA_STATUS_ERROR_INVALID_WAVEFRONT: The wavefront is invalid."; break; case HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP: - *status_string = "HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP: *The signal group is invalid."; + *status_string = "HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP: The signal group is invalid."; break; case HSA_STATUS_ERROR_INVALID_RUNTIME_STATE: *status_string = - "HSA_STATUS_ERROR_INVALID_RUNTIME_STATE: *The HSA runtime is not in the configuration " + "HSA_STATUS_ERROR_INVALID_RUNTIME_STATE: The HSA runtime is not in the configuration " "state."; break; + case HSA_STATUS_ERROR_FATAL: + *status_string = + "HSA_STATUS_ERROR_FATAL: The queue received an error that may require process " + "termination."; case HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED: *status_string = "HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED: Image " diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index 9988dd359..0a70f0308 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -333,6 +333,16 @@ hsa_status_t hsa_amd_profiling_get_async_copy_time( return HSA_STATUS_ERROR; } + // Validate the embedded agent pointer if the signal is IPC. + if (signal->isIPC()) { + for (auto it : core::Runtime::runtime_singleton_->gpu_agents()) { + if (it == agent) break; + } + // If the agent isn't a GPU then it is from a different process or it's a CPU. + // Assume it's a CPU and illegal uses will generate garbage data same as kernel completion. + agent = core::Runtime::runtime_singleton_->cpu_agents()[0]; + } + if (agent->device_type() == core::Agent::DeviceType::kAmdGpuDevice) { // Translate timestamp from GPU to system domain. 
static_cast(agent)->TranslateTime(signal, *time); diff --git a/src/core/runtime/intercept_queue.cpp b/src/core/runtime/intercept_queue.cpp index 0beaddab0..4761a8f4f 100644 --- a/src/core/runtime/intercept_queue.cpp +++ b/src/core/runtime/intercept_queue.cpp @@ -70,7 +70,7 @@ int InterceptQueue::rtti_id_ = 0; InterceptQueue::InterceptQueue(std::unique_ptr queue) : QueueProxy(std::move(queue)), LocalSignal(0), - Signal(signal()), + DoorbellSignal(signal()), next_packet_(0), retry_index_(0), quit_(false), diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index a4a15fa03..431ed6ebf 100644 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -67,9 +67,13 @@ bool Wavefront::GetInfo( std::string Isa::GetFullName() const { std::stringstream full_name; - full_name << GetVendor() << ":" << GetArchitecture() << ":" - << GetMajorVersion() << ":" << GetMinorVersion() << ":" - << GetStepping(); + full_name << GetArchitecture() << "-" << GetVendor() << "-" << GetOS() << "-" + << GetEnvironment() << "-gfx" << GetMajorVersion() + << GetMinorVersion() << GetStepping(); + + if (xnackEnabled_) + full_name << "+xnack"; + return full_name.str(); } @@ -176,8 +180,8 @@ const Isa *IsaRegistry::GetIsa(const std::string &full_name) { return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; } -const Isa *IsaRegistry::GetIsa(const Isa::Version &version) { - auto isareg_iter = supported_isas_.find(Isa(version).GetFullName()); +const Isa *IsaRegistry::GetIsa(const Isa::Version &version, bool xnack) { + auto isareg_iter = supported_isas_.find(Isa(version, xnack).GetFullName()); return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; } @@ -185,25 +189,27 @@ const IsaRegistry::IsaMap IsaRegistry::supported_isas_ = IsaRegistry::GetSupportedIsas(); const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() { -#define ISAREG_ENTRY_GEN(maj, min, stp) \ - Isa amd_amdgpu_##maj##min##stp; \ - amd_amdgpu_##maj##min##stp.version_ = Isa::Version(maj, min, stp); \ - supported_isas.insert( \ - std::make_pair( \ - amd_amdgpu_##maj##min##stp.GetFullName(), amd_amdgpu_##maj##min##stp)); \ +#define ISAREG_ENTRY_GEN(maj, min, stp, xnack) \ + Isa amd_amdgpu_##maj##min##stp##xnack; \ + amd_amdgpu_##maj##min##stp##xnack.version_ = Isa::Version(maj, min, stp); \ + amd_amdgpu_##maj##min##stp##xnack.xnackEnabled_ = xnack; \ + supported_isas.insert(std::make_pair( \ + amd_amdgpu_##maj##min##stp##xnack.GetFullName(), \ + amd_amdgpu_##maj##min##stp##xnack)); \ IsaMap supported_isas; - ISAREG_ENTRY_GEN(7, 0, 0) - ISAREG_ENTRY_GEN(7, 0, 1) - ISAREG_ENTRY_GEN(7, 0, 2) - ISAREG_ENTRY_GEN(8, 0, 1) - ISAREG_ENTRY_GEN(8, 0, 2) - ISAREG_ENTRY_GEN(8, 0, 3) - ISAREG_ENTRY_GEN(9, 0, 0) - ISAREG_ENTRY_GEN(9, 0, 1) - ISAREG_ENTRY_GEN(9, 0, 2) - ISAREG_ENTRY_GEN(9, 0, 3) + ISAREG_ENTRY_GEN(7, 0, 0, false) + ISAREG_ENTRY_GEN(7, 0, 1, false) + ISAREG_ENTRY_GEN(7, 0, 2, false) + ISAREG_ENTRY_GEN(8, 0, 1, true) + ISAREG_ENTRY_GEN(8, 0, 2, false) + ISAREG_ENTRY_GEN(8, 0, 3, false) + ISAREG_ENTRY_GEN(8, 1, 0, true) + ISAREG_ENTRY_GEN(9, 0, 0, false) + ISAREG_ENTRY_GEN(9, 0, 2, true) + ISAREG_ENTRY_GEN(9, 0, 4, false) + ISAREG_ENTRY_GEN(9, 0, 6, false) return supported_isas; } diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 497d9b85c..0625c6f5f 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -108,16 +108,17 @@ hsa_status_t Runtime::Acquire() { } runtime_singleton_->ref_count_++; + MAKE_NAMED_SCOPE_GUARD(refGuard, [&]() { 
runtime_singleton_->ref_count_--; }); if (runtime_singleton_->ref_count_ == 1) { hsa_status_t status = runtime_singleton_->Load(); if (status != HSA_STATUS_SUCCESS) { - runtime_singleton_->ref_count_--; return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } } + refGuard.Dismiss(); return HSA_STATUS_SUCCESS; } @@ -1101,15 +1102,69 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { reason += "Unknown"; } - fprintf(stderr, - "Memory access fault by GPU node-%u on address %p%s. Reason: %s.\n", - fault.NodeId, reinterpret_cast(fault.VirtualAddress), - (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", - reason.c_str()); - } else { - assert(false && "GPU memory access fault."); + core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; + + fprintf( + stderr, + "Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. Reason: %s.\n", + fault.NodeId, reinterpret_cast(faultingAgent->public_handle().handle), + reinterpret_cast(fault.VirtualAddress), + (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str()); + +#ifndef NDEBUG + runtime_singleton_->memory_lock_.Acquire(); + auto it = runtime_singleton_->allocation_map_.upper_bound( + reinterpret_cast(fault.VirtualAddress)); + for (int i = 0; i < 2; i++) { + if (it != runtime_singleton_->allocation_map_.begin()) it--; + } + fprintf(stderr, "Nearby memory map:\n"); + auto start = it; + for (int i = 0; i < 3; i++) { + if (it == runtime_singleton_->allocation_map_.end()) break; + std::string kind = "Non-HSA"; + if (it->second.region != nullptr) { + const amd::MemoryRegion* region = + static_cast(it->second.region); + if (region->IsSystem()) + kind = "System"; + else if (region->IsLocalMemory()) + kind = "VRAM"; + else if (region->IsScratch()) + kind = "Scratch"; + else if (region->IsLDS()) + kind = "LDS"; + } + fprintf(stderr, "%p, 0x%lx, %s\n", it->first, it->second.size, kind.c_str()); + it++; + } + fprintf(stderr, "\n"); + it = start; + runtime_singleton_->memory_lock_.Release(); + hsa_amd_pointer_info_t info; + PtrInfoBlockData block; + uint32_t count; + hsa_agent_t* canAccess; + info.size = sizeof(info); + for (int i = 0; i < 3; i++) { + if (it == runtime_singleton_->allocation_map_.end()) break; + runtime_singleton_->PtrInfo(const_cast(it->first), &info, malloc, &count, &canAccess, + &block); + fprintf(stderr, + "PtrInfo:\n\tAddress: %p-%p/%p-%p\n\tSize: 0x%lx\n\tType: %u\n\tOwner: %p\n", + info.agentBaseAddress, (char*)info.agentBaseAddress + info.sizeInBytes, + info.hostBaseAddress, (char*)info.hostBaseAddress + info.sizeInBytes, + info.sizeInBytes, info.type, reinterpret_cast(info.agentOwner.handle)); + fprintf(stderr, "\tCanAccess: %u\n", count); + for (int t = 0; t < count; t++) + fprintf(stderr, "\t\t%p\n", reinterpret_cast(canAccess[t].handle)); + fprintf(stderr, "\tIn block: %p, 0x%lx\n", block.base, block.length); + free(canAccess); + it++; + } +#endif //! NDEBUG } - + assert(false && "GPU memory access fault."); std::abort(); } // No need to keep the signal because we are done. 
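Runtime::Acquire and GpuAgent::QueueCreate now rely on named scope guards to roll back partially applied state on early returns. A minimal standalone sketch of that pattern (simplified types, not the runtime's utility code):

#include <functional>

struct ScopeGuard {
  explicit ScopeGuard(std::function<void()> f) : rollback(std::move(f)) {}
  ~ScopeGuard() { if (active) rollback(); }
  void Dismiss() { active = false; }
 private:
  std::function<void()> rollback;
  bool active = true;
};

// Mirrors the refGuard usage above: the increment survives only if loading
// succeeds; every failure path restores the reference count automatically.
bool AcquireLike(int& ref_count, bool load_ok) {
  ref_count++;
  ScopeGuard guard([&]() { ref_count--; });
  if (!load_ok) return false;   // guard fires here and undoes the increment
  guard.Dismiss();              // success: keep the increment
  return true;
}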
@@ -1167,6 +1222,9 @@ void Runtime::Unload() { amd::hsa::loader::Loader::Destroy(loader_); loader_ = nullptr; + std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject()); + gpu_agents_.clear(); + async_events_control_.Shutdown(); if (vm_fault_signal_ != nullptr) { diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 1de3b15ab..7593bf36e 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -69,8 +69,7 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_INTERRUPT"); enable_interrupt_ = (var == "0") ? false : true; - var = os::GetEnvVar("HSA_ENABLE_SDMA"); - enable_sdma_ = (var == "0") ? false : true; + enable_sdma_ = os::GetEnvVar("HSA_ENABLE_SDMA"); var = os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND"); running_valgrind_ = (var == "1") ? true : false; @@ -91,17 +90,20 @@ class Flag { var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR"); disable_fragment_alloc_ = (var == "1") ? true : false; + + var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH"); + enable_sdma_hdp_flush_ = (var == "0") ? false : true; } bool check_flat_scratch() const { return check_flat_scratch_; } bool enable_vm_fault_message() const { return enable_vm_fault_message_; } - + bool enable_queue_fault_message() const { return enable_queue_fault_message_; } bool enable_interrupt() const { return enable_interrupt_; } - bool enable_sdma() const { return enable_sdma_; } + bool enable_sdma_hdp_flush() const { return enable_sdma_hdp_flush_; } bool running_valgrind() const { return running_valgrind_; } @@ -111,6 +113,8 @@ class Flag { bool disable_fragment_alloc() const { return disable_fragment_alloc_; } + std::string enable_sdma() const { return enable_sdma_; } + uint32_t max_queues() const { return max_queues_; } size_t scratch_mem_size() const { return scratch_mem_size_; } @@ -121,13 +125,15 @@ class Flag { bool check_flat_scratch_; bool enable_vm_fault_message_; bool enable_interrupt_; - bool enable_sdma_; + bool enable_sdma_hdp_flush_; bool running_valgrind_; bool sdma_wait_idle_; bool enable_queue_fault_message_; bool report_tool_load_failures_; bool disable_fragment_alloc_; + std::string enable_sdma_; + uint32_t max_queues_; size_t scratch_mem_size_; diff --git a/src/core/util/lazy_ptr.h b/src/core/util/lazy_ptr.h new file mode 100644 index 000000000..7837200d8 --- /dev/null +++ b/src/core/util/lazy_ptr.h @@ -0,0 +1,125 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIESd OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ +#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ + +#include +#include +#include + +#include "core/util/utils.h" + +/* + * Wrapper for a std::unique_ptr that initializes its object at first use. + */ +template class lazy_ptr { + public: + lazy_ptr() {} + + explicit lazy_ptr(std::function Constructor) { Init(Constructor); } + + void reset(std::function Constructor = nullptr) { + obj.reset(); + func = Constructor; + } + + void reset(T* ptr) { + obj.reset(ptr); + func = nullptr; + } + + bool operator==(T* rhs) const { return obj.get() == rhs; } + bool operator!=(T* rhs) const { return obj.get() != rhs; } + + const std::unique_ptr& operator->() const { + make(true); + return obj; + } + + std::unique_ptr& operator*() { + make(true); + return obj; + } + + const std::unique_ptr& operator*() const { + make(true); + return obj; + } + + /* + * Ensures that the object is created or is being created. + * This is useful when early consruction of the object is required. + */ + void touch() const { make(false); } + + private: + mutable std::unique_ptr obj; + mutable std::function func; + mutable KernelMutex lock; + + // Separated from make to improve inlining. + void make_body(bool block) const { + if (block) { + lock.Acquire(); + } else if (!lock.Try()) { + return; + } + MAKE_SCOPE_GUARD([&]() { lock.Release(); }); + if (obj != nullptr) return; + T* ptr = func(); + std::atomic_thread_fence(std::memory_order_release); + obj.reset(ptr); + func = nullptr; + } + + __forceinline void make(bool block) const { + std::atomic_thread_fence(std::memory_order_acquire); + if (obj == nullptr) { + make_body(block); + } + } + + DISALLOW_COPY_AND_ASSIGN(lazy_ptr); +}; + +#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ diff --git a/src/core/util/small_heap.cpp b/src/core/util/small_heap.cpp index 9b8998bf3..6cd8e117d 100644 --- a/src/core/util/small_heap.cpp +++ b/src/core/util/small_heap.cpp @@ -42,25 +42,47 @@ #include "small_heap.h" -SmallHeap::memory_t::iterator SmallHeap::merge( - SmallHeap::memory_t::iterator& keep, - SmallHeap::memory_t::iterator& destroy) { - assert((char*)keep->first + keep->second.len == (char*)destroy->first && - "Invalid merge"); - assert(keep->second.isfree() && "Merge with allocated block"); - assert(destroy->second.isfree() && "Merge with allocated block"); - - keep->second.len += destroy->second.len; - keep->second.next_free = destroy->second.next_free; - if (!destroy->second.islastfree()) - memory[destroy->second.next_free].prior_free = keep->first; - - memory.erase(destroy); - return keep; +// Inserts node into freelist after place. +// Assumes node will not be an end of the list (list has guard nodes). 
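The new lazy_ptr<T> above defers construction to first use, which is how the agent's queues and blit engines avoid being created until something needs them. A short usage sketch (Widget and Example are stand-ins, not runtime types):

#include "core/util/lazy_ptr.h"

struct Widget {
  void Run() {}
};

void Example() {
  lazy_ptr<Widget> w;
  w.reset([]() { return new Widget(); });  // arm the factory; nothing is built yet
  w->Run();   // first dereference runs the factory under the internal lock and caches the object
  w.touch();  // alternatively, force early construction without using the result
}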
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) { + assert(place->first < node->first && "Order violation"); + assert(isfree(place->second) && "Freelist operation error."); + iterator_t next = place->second.next; + node->second.next = next; + node->second.prior = place; + place->second.next = node; + next->second.prior = node; +} + +// Removes node from freelist. +// Assumes node will not be an end of the list (list has guard nodes). +void SmallHeap::remove(SmallHeap::iterator_t node) { + assert(isfree(node->second) && "Freelist operation error."); + node->second.prior->second.next = node->second.next; + node->second.next->second.prior = node->second.prior; + setused(node->second); +} + +// Returns high if merge failed or the merged node. +SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low, + SmallHeap::memory_t::iterator high) { + assert(isfree(low->second) && "Merge with allocated block"); + assert(isfree(high->second) && "Merge with allocated block"); + + if ((char*)low->first + low->second.len != (char*)high->first) return high; + + assert(!islastfree(high->second) && "Illegal merge."); + + low->second.len += high->second.len; + low->second.next = high->second.next; + high->second.next->second.prior = low; + + memory.erase(high); + return low; } void SmallHeap::free(void* ptr) { - if (ptr == NULL) return; + if (ptr == nullptr) return; auto iterator = memory.find(ptr); @@ -70,105 +92,90 @@ void SmallHeap::free(void* ptr) { return; } - const auto start_guard = memory.find(0); - const auto end_guard = memory.find((void*)0xFFFFFFFFFFFFFFFFull); - // Return memory to total and link node into free list total_free += iterator->second.len; - if (first_free < iterator->first) { - auto before = iterator; - before--; - while (before != start_guard && !before->second.isfree()) before--; - assert(before->second.next_free > iterator->first && - "Inconsistency in small heap."); - iterator->second.prior_free = before->first; - iterator->second.next_free = before->second.next_free; - before->second.next_free = iterator->first; - if (!iterator->second.islastfree()) - memory[iterator->second.next_free].prior_free = iterator->first; - } else { - iterator->second.setfirstfree(); - iterator->second.next_free = first_free; - first_free = iterator->first; - if (!iterator->second.islastfree()) - memory[iterator->second.next_free].prior_free = iterator->first; - } - // Attempt compaction + // Could also traverse the free list which might be faster in some cases. auto before = iterator; before--; - if (before != start_guard) { - if (before->second.isfree()) { - iterator = merge(before, iterator); - } - } + while (!isfree(before->second)) before--; + assert(before->second.next->first > iterator->first && "Inconsistency in small heap."); + insertafter(before, iterator); - auto after = iterator; - after++; - if (after != end_guard) { - if (after->second.isfree()) { - iterator = merge(iterator, after); - } - } + // Attempt compaction + iterator = merge(before, iterator); + merge(iterator, iterator->second.next); + + // Update lowHighBondary + high.erase(ptr); } void* SmallHeap::alloc(size_t bytes) { // Is enough memory available? 
- if ((bytes > total_free) || (bytes == 0)) return NULL; + if ((bytes > total_free) || (bytes == 0)) return nullptr; - memory_t::iterator current; - memory_t::iterator prior; + iterator_t current; // Walk the free list and allocate at first fitting location - prior = current = memory.find(first_free); - while (true) { + current = firstfree(); + while (!islastfree(current->second)) { if (bytes <= current->second.len) { // Decrement from total total_free -= bytes; - // Is allocation an exact fit? - if (bytes == current->second.len) { - if (prior == current) { - first_free = current->second.next_free; - if (!current->second.islastfree()) - memory[current->second.next_free].setfirstfree(); - } else { - prior->second.next_free = current->second.next_free; - if (!current->second.islastfree()) - memory[current->second.next_free].prior_free = prior->first; - } - current->second.next_free = NULL; - return current->first; - } else { - // Split current node + // Split node + if (bytes != current->second.len) { void* remaining = (char*)current->first + bytes; Node& node = memory[remaining]; - node.next_free = current->second.next_free; - node.prior_free = current->second.prior_free; node.len = current->second.len - bytes; current->second.len = bytes; - - if (prior == current) { - first_free = remaining; - node.setfirstfree(); - } else { - prior->second.next_free = remaining; - node.prior_free = prior->first; - } - if (!node.islastfree()) memory[node.next_free].prior_free = remaining; - - current->second.next_free = NULL; - return current->first; + insertafter(current, memory.find(remaining)); } + + remove(current); + return current->first; } + current = current->second.next; + } + assert(current->second.len == 0 && "Freelist corruption."); - // End of free list? - if (current->second.islastfree()) break; + // Can't service the request due to fragmentation + return nullptr; +} - prior = current; - current = memory.find(current->second.next_free); +void* SmallHeap::alloc_high(size_t bytes) { + // Is enough memory available? 
+ if ((bytes > total_free) || (bytes == 0)) return nullptr; + + iterator_t current; + + // Walk the free list and allocate at first fitting location + current = lastfree(); + while (!isfirstfree(current->second)) { + if (bytes <= current->second.len) { + // Decrement from total + total_free -= bytes; + + void* alloc; + // Split node + if (bytes != current->second.len) { + alloc = (char*)current->first + current->second.len - bytes; + current->second.len -= bytes; + Node& node = memory[alloc]; + node.len = bytes; + setused(node); + } else { + alloc = current->first; + remove(current); + } + + high.insert(alloc); + return alloc; + } + current = current->second.prior; } + assert(current->second.len == 0 && "Freelist corruption."); // Can't service the request due to fragmentation - return NULL; + return nullptr; } diff --git a/src/core/util/small_heap.h b/src/core/util/small_heap.h index fecfe857f..7a366564a 100644 --- a/src/core/util/small_heap.h +++ b/src/core/util/small_heap.h @@ -50,65 +50,78 @@ #include "utils.h" #include +#include class SmallHeap { - public: - class Node { - public: + private: + struct Node; + typedef std::map memory_t; + typedef memory_t::iterator iterator_t; + + struct Node { size_t len; - void* next_free; - void* prior_free; - static const intptr_t END = -1; - - __forceinline bool isfree() const { return next_free != NULL; } - __forceinline bool islastfree() const { return intptr_t(next_free) == END; } - __forceinline bool isfirstfree() const { - return intptr_t(prior_free) == END; - } - __forceinline void setlastfree() { - *reinterpret_cast(&next_free) = END; - } - __forceinline void setfirstfree() { - *reinterpret_cast(&prior_free) = END; - } + iterator_t next; + iterator_t prior; }; - private: - SmallHeap(const SmallHeap& rhs); - SmallHeap& operator=(const SmallHeap& rhs); + SmallHeap(const SmallHeap& rhs) = delete; + SmallHeap& operator=(const SmallHeap& rhs) = delete; void* const pool; const size_t length; size_t total_free; - void* first_free; - std::map memory; - - typedef decltype(memory) memory_t; - memory_t::iterator merge(memory_t::iterator& keep, - memory_t::iterator& destroy); + memory_t memory; + std::set high; + + __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); } + __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); } + __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); } + __forceinline void setlastfree(Node& node) { node.next = memory.end(); } + __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); } + __forceinline void setused(Node& node) { node.next = memory.begin(); } + + __forceinline iterator_t firstfree() { return memory.begin()->second.next; } + __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; } + void insertafter(iterator_t place, iterator_t node); + void remove(iterator_t node); + iterator_t merge(iterator_t low, iterator_t high); public: - SmallHeap() : pool(NULL), length(0), total_free(0) {} + SmallHeap() : pool(nullptr), length(0), total_free(0) {} SmallHeap(void* base, size_t length) : pool(base), length(length), total_free(length) { - first_free = pool; + assert(pool != nullptr && "Invalid base address."); + assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address."); + assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds."); + + Node& start = memory[0]; + Node& node = memory[pool]; + Node& end = 
memory[(void*)0xFFFFFFFFFFFFFFFFull]; + + start.len = 0; + start.next = memory.find(pool); + setfirstfree(start); - Node& node = memory[first_free]; node.len = length; - node.setlastfree(); - node.setfirstfree(); + node.prior = memory.begin(); + node.next = --memory.end(); + + end.len = 0; + end.prior = start.next; + setlastfree(end); - memory[0].len = 0; - memory[(void*)0xFFFFFFFFFFFFFFFFull].len = 0; + high.insert((void*)0xFFFFFFFFFFFFFFFFull); } void* alloc(size_t bytes); + void* alloc_high(size_t bytes); void free(void* ptr); void* base() const { return pool; } size_t size() const { return length; } size_t remaining() const { return total_free; } + void* high_split() const { return *high.begin(); } }; #endif diff --git a/src/inc/amd_hsa_common.h b/src/inc/amd_hsa_common.h index ca6a2b983..bfb613ec4 100644 --- a/src/inc/amd_hsa_common.h +++ b/src/inc/amd_hsa_common.h @@ -75,8 +75,8 @@ // Creates enumeration entries for packed types. Enumeration entries include // bit shift amount, bit width, and bit mask. #define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \ - name ## _SHIFT = (shift), \ - name ## _WIDTH = (width), \ + name##_SHIFT = (shift), \ + name##_WIDTH = (width), \ name = (((1 << (width)) - 1) << (shift)) \ // Gets bits for specified mask from specified src packed instance. @@ -85,7 +85,7 @@ // Sets val bits for specified mask in specified dst packed instance. #define AMD_HSA_BITS_SET(dst, mask, val) \ - dst &= (~(1 << mask ## _SHIFT) & ~mask); \ - dst |= (((val) << mask ## _SHIFT) & mask) \ + dst &= (~(1 << mask##_SHIFT) & ~mask); \ + dst |= (((val) << mask##_SHIFT) & mask) \ #endif // AMD_HSA_COMMON_H diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index 95f89c635..e0702b175 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -52,6 +52,10 @@ #define EF_AMDGPU_XNACK 0x00000001 #define EF_AMDGPU_TRAP_HANDLER 0x00000002 +// FIXME: We really need to start thinking about separating legacy code out, +// it is getting messy. +#define EF_AMDGPU_XNACK_LC 0x100 + // ELF Section Header Flag Enumeration Values. #define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS) #define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS) @@ -116,6 +120,7 @@ typedef enum { #define R_AMDGPU_64 3 #define R_AMDGPU_INIT_SAMPLER 4 #define R_AMDGPU_INIT_IMAGE 5 +#define R_AMDGPU_RELATIVE64 13 // AMD GPU Note Type Enumeration Values. #define NT_AMDGPU_HSA_CODE_OBJECT_VERSION 1 diff --git a/src/inc/amd_hsa_queue.h b/src/inc/amd_hsa_queue.h index b37bb53f3..2176e8470 100644 --- a/src/inc/amd_hsa_queue.h +++ b/src/inc/amd_hsa_queue.h @@ -53,7 +53,8 @@ enum amd_queue_properties_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 4, 28) + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27) }; // AMD Queue. diff --git a/src/inc/hsa.h b/src/inc/hsa.h index b8ec90a66..0ed2b689e 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -116,148 +116,151 @@ extern "C" { * @brief Status codes. */ typedef enum { - /** - * The function has been executed successfully. - */ - HSA_STATUS_SUCCESS = 0x0, - /** - * A traversal over a list of elements has been interrupted by the - * application before completing. 
- */ - HSA_STATUS_INFO_BREAK = 0x1, - /** - * A generic error has occurred. - */ - HSA_STATUS_ERROR = 0x1000, - /** - * One of the actual arguments does not meet a precondition stated in the - * documentation of the corresponding formal argument. - */ - HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, - /** - * The requested queue creation is not valid. - */ - HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, - /** - * The requested allocation is not valid. - */ - HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, - /** - * The agent is invalid. - */ - HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, - /** - * The memory region is invalid. - */ - HSA_STATUS_ERROR_INVALID_REGION = 0x1005, - /** - * The signal is invalid. - */ - HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, - /** - * The queue is invalid. - */ - HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, - /** - * The HSA runtime failed to allocate the necessary resources. This error - * may also occur when the HSA runtime needs to spawn threads or create - * internal OS-specific events. - */ - HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, - /** - * The AQL packet is malformed. - */ - HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, - /** - * An error has been detected while releasing a resource. - */ - HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, - /** - * An API other than ::hsa_init has been invoked while the reference count - * of the HSA runtime is 0. - */ - HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, - /** - * The maximum reference count for the object has been reached. - */ - HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, - /** - * The arguments passed to a functions are not compatible. - */ - HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, - /** - * The index is invalid. - */ - HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, - /** - * The instruction set architecture is invalid. - */ - HSA_STATUS_ERROR_INVALID_ISA = 0x100F, - /** - * The instruction set architecture name is invalid. - */ - HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, - /** - * The code object is invalid. - */ - HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, - /** - * The executable is invalid. - */ - HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, - /** - * The executable is frozen. - */ - HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, - /** - * There is no symbol with the given name. - */ - HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, - /** - * The variable is already defined. - */ - HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, - /** - * The variable is undefined. - */ - HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, - /** - * An HSAIL operation resulted in a hardware exception. - */ - HSA_STATUS_ERROR_EXCEPTION = 0x1016, - /** - * The code object symbol is invalid. - */ - HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, - /** - * The executable symbol is invalid. - */ - HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, - /** - * The file descriptor is invalid. - */ - HSA_STATUS_ERROR_INVALID_FILE = 0x1020, - /** - * The code object reader is invalid. - */ - HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, - /** - * The cache is invalid. - */ - HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, - /** - * The wavefront is invalid. - */ - HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, - /** - * The signal group is invalid. - */ - HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, - /** - * The HSA runtime is not in the configuration state. - */ - HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025 - + /** + * The function has been executed successfully. 
+ */ + HSA_STATUS_SUCCESS = 0x0, + /** + * A traversal over a list of elements has been interrupted by the + * application before completing. + */ + HSA_STATUS_INFO_BREAK = 0x1, + /** + * A generic error has occurred. + */ + HSA_STATUS_ERROR = 0x1000, + /** + * One of the actual arguments does not meet a precondition stated in the + * documentation of the corresponding formal argument. + */ + HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, + /** + * The requested queue creation is not valid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, + /** + * The requested allocation is not valid. + */ + HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, + /** + * The agent is invalid. + */ + HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, + /** + * The memory region is invalid. + */ + HSA_STATUS_ERROR_INVALID_REGION = 0x1005, + /** + * The signal is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, + /** + * The queue is invalid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, + /** + * The HSA runtime failed to allocate the necessary resources. This error + * may also occur when the HSA runtime needs to spawn threads or create + * internal OS-specific events. + */ + HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, + /** + * The AQL packet is malformed. + */ + HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, + /** + * An error has been detected while releasing a resource. + */ + HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, + /** + * An API other than ::hsa_init has been invoked while the reference count + * of the HSA runtime is 0. + */ + HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, + /** + * The maximum reference count for the object has been reached. + */ + HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, + /** + * The arguments passed to a functions are not compatible. + */ + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, + /** + * The index is invalid. + */ + HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, + /** + * The instruction set architecture is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA = 0x100F, + /** + * The instruction set architecture name is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, + /** + * The code object is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, + /** + * The executable is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, + /** + * The executable is frozen. + */ + HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, + /** + * There is no symbol with the given name. + */ + HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, + /** + * The variable is already defined. + */ + HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, + /** + * The variable is undefined. + */ + HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, + /** + * An HSAIL operation resulted in a hardware exception. + */ + HSA_STATUS_ERROR_EXCEPTION = 0x1016, + /** + * The code object symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, + /** + * The executable symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, + /** + * The file descriptor is invalid. + */ + HSA_STATUS_ERROR_INVALID_FILE = 0x1020, + /** + * The code object reader is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, + /** + * The cache is invalid. + */ + HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, + /** + * The wavefront is invalid. + */ + HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, + /** + * The signal group is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, + /** + * The HSA runtime is not in the configuration state. 
+   */
+  HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025,
+  /**
+   * The queue received an error that may require process termination.
+   */
+  HSA_STATUS_ERROR_FATAL = 0x1026
 } hsa_status_t;
 
 /**
diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h
old mode 100755
new mode 100644
index 78db49608..b4b95e2f3
--- a/src/inc/hsa_ext_amd.h
+++ b/src/inc/hsa_ext_amd.h
@@ -299,9 +299,8 @@ hsa_status_t HSA_API
  * @brief Retrieve packet processing time stamps.
  *
  * @param[in] agent The agent with which the signal was last used. For
- *instance,
- * if the profiled dispatch packet is dispatched on to queue Q, which was
- * created on agent A, then this parameter must be A.
+ * instance, if the profiled dispatch packet is dispatched onto queue Q,
+ * which was created on agent A, then this parameter must be A.
 *
 * @param[in] signal A signal used as the completion signal of the dispatch
 * packet to retrieve time stamps from. This dispatch packet must have been
@@ -388,8 +387,10 @@ typedef enum {
   HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1,
   /**
    * Signal may be used for interprocess communication.
-   * This signal may not be used with profiling APIs. Errors or inaccurate
-   * timing data may result from such use.
+   * IPC signals can be read, written, and waited on from any process.
+   * Profiling using an IPC enabled signal is only supported in a single process
+   * at a time. Producing profiling data in one process and consuming it in
+   * another process is undefined.
    */
   HSA_AMD_SIGNAL_IPC = 2,
 } hsa_amd_signal_attribute_t;
diff --git a/src/libamdhsacode/amd_elf_image.cpp b/src/libamdhsacode/amd_elf_image.cpp
index 80c25d520..ee9ff3135 100644
--- a/src/libamdhsacode/amd_elf_image.cpp
+++ b/src/libamdhsacode/amd_elf_image.cpp
@@ -703,6 +703,7 @@ namespace amd {
     uint16_t Machine() override { return ehdr.e_machine; }
     uint16_t Type() override { return ehdr.e_type; }
+    uint32_t EFlags() override { return ehdr.e_flags; }
 
     GElfStringTable* shstrtab() override;
     GElfStringTable* strtab() override;
diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp
index 7b88e0d4a..eb3511f18 100644
--- a/src/loader/executable.cpp
+++ b/src/loader/executable.cpp
@@ -1016,6 +1016,71 @@ static uint32_t NextCodeObjectNum()
   return dumpN++;
 }
 
+static std::string ConvertOldTargetNameToNew(
+    const std::string &OldName, bool IsFinalizer, uint32_t EFlags) {
+  std::string NewName = "";
+
+  // FIXME #1: Should 9:0:3 be completely (loader, sc, etc.) removed?
+  // FIXME #2: What does PAL do with respect to boltzmann/usual fiji/tonga?
+  if (OldName == "AMD:AMDGPU:7:0:0")
+    NewName = "amdgcn-amd-amdhsa--gfx700";
+  else if (OldName == "AMD:AMDGPU:7:0:1")
+    NewName = "amdgcn-amd-amdhsa--gfx701";
+  else if (OldName == "AMD:AMDGPU:7:0:2")
+    NewName = "amdgcn-amd-amdhsa--gfx702";
+  else if (OldName == "AMD:AMDGPU:7:0:3")
+    NewName = "amdgcn-amd-amdhsa--gfx703";
+  else if (OldName == "AMD:AMDGPU:7:0:4")
+    NewName = "amdgcn-amd-amdhsa--gfx704";
+  else if (OldName == "AMD:AMDGPU:8:0:0")
+    NewName = "amdgcn-amd-amdhsa--gfx800";
+  else if (OldName == "AMD:AMDGPU:8:0:1")
+    NewName = "amdgcn-amd-amdhsa--gfx801";
+  else if (OldName == "AMD:AMDGPU:8:0:2")
+    NewName = "amdgcn-amd-amdhsa--gfx802";
+  else if (OldName == "AMD:AMDGPU:8:0:3")
+    NewName = "amdgcn-amd-amdhsa--gfx803";
+  else if (OldName == "AMD:AMDGPU:8:0:4")
+    NewName = "amdgcn-amd-amdhsa--gfx804";
+  else if (OldName == "AMD:AMDGPU:8:1:0")
+    NewName = "amdgcn-amd-amdhsa--gfx810";
+  else if (OldName == "AMD:AMDGPU:9:0:0")
+    NewName = "amdgcn-amd-amdhsa--gfx900";
+  else if (OldName == "AMD:AMDGPU:9:0:1")
+    NewName = "amdgcn-amd-amdhsa--gfx900";
+  else if (OldName == "AMD:AMDGPU:9:0:2")
+    NewName = "amdgcn-amd-amdhsa--gfx902";
+  else if (OldName == "AMD:AMDGPU:9:0:3")
+    NewName = "amdgcn-amd-amdhsa--gfx902";
+  else if (OldName == "AMD:AMDGPU:9:0:4")
+    NewName = "amdgcn-amd-amdhsa--gfx904";
+  else if (OldName == "AMD:AMDGPU:9:0:6")
+    NewName = "amdgcn-amd-amdhsa--gfx906";
+  else
+    assert(false && "Unhandled target");
+
+  if (IsFinalizer && (EFlags & EF_AMDGPU_XNACK)) {
+    NewName = NewName + "+xnack";
+  } else {
+    if (EFlags != 0 && (EFlags & EF_AMDGPU_XNACK_LC)) {
+      NewName = NewName + "+xnack";
+    } else {
+      if (OldName == "AMD:AMDGPU:8:0:1")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:8:1:0")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:9:0:1")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:9:0:2")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:9:0:3")
+        NewName = NewName + "+xnack";
    }
  }
+
+  return NewName;
+}
+
 hsa_status_t ExecutableImpl::LoadCodeObject(
   hsa_agent_t agent,
   hsa_code_object_t code_object,
@@ -1110,9 +1175,6 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
   std::string codeIsa;
   if (!code->GetNoteIsa(codeIsa)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; }
 
-  hsa_isa_t objectsIsa = context_->IsaFromName(codeIsa.c_str());
-  if (!objectsIsa.handle) { return HSA_STATUS_ERROR_INVALID_ISA_NAME; }
-
   uint32_t majorVersion, minorVersion;
   if (!code->GetNoteCodeObjectVersion(&majorVersion, &minorVersion)) {
     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
@@ -1120,20 +1182,30 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
   if (majorVersion != 1 && majorVersion != 2) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; }
   if (agent.handle == 0 && majorVersion == 1) { return HSA_STATUS_ERROR_INVALID_AGENT; }
-  if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa)) { return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; }
+  bool IsFinalizer = true;
 
   uint32_t codeHsailMajor;
   uint32_t codeHsailMinor;
   hsa_profile_t codeProfile;
   hsa_machine_model_t codeMachineModel;
   hsa_default_float_rounding_mode_t codeRoundingMode;
   if (!code->GetNoteHsail(&codeHsailMajor, &codeHsailMinor, &codeProfile, &codeMachineModel, &codeRoundingMode)) {
+    // Only finalizer generated the "HSAIL" note.
+    IsFinalizer = false;
     codeProfile = HSA_PROFILE_FULL;
   }
   if (profile_ != codeProfile) { return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; }
 
+  codeIsa = ConvertOldTargetNameToNew(codeIsa, IsFinalizer, code->EFlags());
+  hsa_isa_t objectsIsa = context_->IsaFromName(codeIsa.c_str());
+  if (!objectsIsa.handle) { return HSA_STATUS_ERROR_INVALID_ISA_NAME; }
+
+  if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa)) {
+    return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+  }
+
   hsa_status_t status;
 
   objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize()));
@@ -1673,14 +1745,16 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa:
       break;
     }
 
-    case R_AMDGPU_INIT_IMAGE:
-    case R_AMDGPU_INIT_SAMPLER:
-      // Images and samplers are not supported in v2.1.
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    case R_AMDGPU_RELATIVE64:
+    {
+      int64_t baseDelta = reinterpret_cast<int64_t>(relSeg->Address(0)) - relSeg->VAddr();
+      uint64_t relocatedAddr = baseDelta + rel->addend();
+      relSeg->Copy(rel->offset(), &relocatedAddr, sizeof(relocatedAddr));
+      break;
+    }
     default:
-      // Ignore.
-      break;
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
   }
   return HSA_STATUS_SUCCESS;
 }
diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp
old mode 100755
new mode 100644
index 7a795acce..624a8c894
--- a/src/loader/loaders.cpp
+++ b/src/loader/loaders.cpp
@@ -87,22 +87,16 @@ namespace loader {
     gfx803.handle = 803;
     gfx804.handle = 804;
     gfx810.handle = 810;
-#if defined(GFX9_BUILD)
     gfx900.handle = 900;
     gfx901.handle = 901;
-    gfx901.handle = 902;
+    gfx902.handle = 902;
     gfx903.handle = 903;
-    gfx903.handle = 904;
-    gfx903.handle = 905;
-    gfx903.handle = 906;
-    gfx907.handle = 907;
-#endif // GFX9_BUILD
   }
 
   hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
   {
     std::string sname(name);
-    if (sname == "AMD:AMDGPU:7:0:0") {
+    if (sname == "AMD:AMDGPU:7:0:0") {
       return gfx700;
     } else if (sname == "AMD:AMDGPU:7:0:1") {
       return gfx701;
@@ -118,7 +112,6 @@ namespace loader {
       return gfx804;
     } else if (sname == "AMD:AMDGPU:8:1:0") {
       return gfx810;
-#if defined(GFX9_BUILD)
     } else if (sname == "AMD:AMDGPU:9:0:0") {
       return gfx900;
     } else if (sname == "AMD:AMDGPU:9:0:1") {
@@ -127,19 +120,10 @@ namespace loader {
       return gfx902;
     } else if (sname == "AMD:AMDGPU:9:0:3") {
       return gfx903;
-    } else if (sname == "AMD:AMDGPU:9:0:4") {
-      return gfx904;
-    } else if (sname == "AMD:AMDGPU:9:0:5") {
-      return gfx905;
-    } else if (sname == "AMD:AMDGPU:9:0:6") {
-      return gfx906;
-    } else if (sname == "AMD:AMDGPU:9:0:7") {
-      return gfx907;
-#endif // GFX_BUILD
-    } else {
-      assert(0);
-      return invalid;
     }
+
+    assert(0);
+    return invalid;
   }
 
   bool OfflineLoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp
index 81e13fbc6..94b3ceca3 100644
--- a/src/loader/loaders.hpp
+++ b/src/loader/loaders.hpp
@@ -55,11 +55,7 @@ namespace loader {
   private:
     hsa_isa_t invalid;
     hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810;
-#if defined(GFX9_BUILD)
-    hsa_isa_t gfx900, gfx901;
-#else
-    hsa_isa_t reserved;
-#endif // GFX9_BUILD
+    hsa_isa_t gfx900, gfx901, gfx902, gfx903;
    std::ostream& out;
    typedef std::set<void*> PointerSet;
    PointerSet pointers;
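The small_heap.h changes earlier in this patch add a second allocation direction (alloc_high) and expose the boundary between the two regions through high_split(). The sketch below shows how those calls would be driven; the constructor signature (base pointer plus length) and the placement comments are assumptions of this example, not something the hunk itself guarantees.

#include <cstdlib>
#include "core/util/small_heap.h"  // assumed include path

void small_heap_example() {
  const size_t kPoolSize = 1 << 20;        // 1 MiB backing allocation
  void* backing = std::malloc(kPoolSize);
  SmallHeap heap(backing, kPoolSize);      // assumed ctor: (base, length)

  void* lo = heap.alloc(4096);             // sub-allocated from the low end
  void* hi = heap.alloc_high(4096);        // sub-allocated from the high end

  // high_split() reports the start of the high-end region: blocks from
  // alloc() are expected below it, blocks from alloc_high() at or above it.
  void* split = heap.high_split();
  (void)lo; (void)hi; (void)split;

  heap.free(lo);
  heap.free(hi);
  std::free(backing);                      // heap does not own the backing store
}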
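The AMD_HSA_BITS_* helpers touched in the amd_hsa_common.h hunk are what drive the new AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE bit added to amd_hsa_queue.h. A small illustration of how the generated _SHIFT/_WIDTH/mask values are consumed follows; only the macros and the enum entry come from the patch, while the helper functions, the include path, and the use of the amd_queue_t::queue_properties field are this example's assumptions.

#include <cstdint>
#include "inc/amd_hsa_queue.h"  // assumed include path; pulls in amd_hsa_common.h

// AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1)
// yields ..._SHIFT = 4, ..._WIDTH = 1 and a mask of ((1 << 1) - 1) << 4 = 0x10.
static void set_use_scratch_once(amd_queue_t* queue) {
  uint32_t props = queue->queue_properties;
  AMD_HSA_BITS_SET(props, AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 1);  // set the bit
  queue->queue_properties = props;
}

static bool uses_scratch_once(const amd_queue_t* queue) {
  return AMD_HSA_BITS_GET(queue->queue_properties,
                          AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE) != 0;
}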
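hsa.h gains HSA_STATUS_ERROR_FATAL for queue errors that may require process termination. The sketch below shows a queue error callback reacting to it; the callback signature is the one accepted by hsa_queue_create(), while terminating via abort() is simply this example's policy, not something the patch mandates.

#include <cstdio>
#include <cstdlib>
#include "inc/hsa.h"  // assumed include path

static void queue_error_cb(hsa_status_t status, hsa_queue_t* queue, void* data) {
  if (status == HSA_STATUS_ERROR_FATAL) {
    // The new status code signals an error that may require process termination.
    std::fprintf(stderr, "Fatal error reported on queue %p, aborting.\n", (void*)queue);
    std::abort();
  }
  std::fprintf(stderr, "Queue %p reported status 0x%x\n", (void*)queue, status);
}

// Usage: install the callback at queue creation time.
// hsa_queue_create(agent, size, HSA_QUEUE_TYPE_MULTI, queue_error_cb,
//                  /*data=*/nullptr, UINT32_MAX, UINT32_MAX, &queue);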
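The hsa_ext_amd.h hunk tightens the wording around HSA_AMD_SIGNAL_IPC. For reference, this is how such a signal is created with hsa_amd_signal_create(); the wrapper function is illustrative only, and the profiling caveat from the updated comment still applies (profile with it from one process at a time).

#include "inc/hsa.h"
#include "inc/hsa_ext_amd.h"  // assumed include paths

hsa_status_t create_ipc_signal(hsa_signal_t* out) {
  // No consumer list: any agent may wait on it, and with HSA_AMD_SIGNAL_IPC
  // the signal can also be shared with other processes.
  return hsa_amd_signal_create(/*initial_value=*/1, /*num_consumers=*/0,
                               /*consumers=*/nullptr, HSA_AMD_SIGNAL_IPC, out);
}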
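The new R_AMDGPU_RELATIVE64 case in ApplyDynamicRelocation writes the segment's load bias plus the relocation addend at the relocation site. The generic sketch below restates that arithmetic outside the loader's Segment interface; the function and parameter names are illustrative and not part of the patch.

#include <cstdint>
#include <cstring>

static void apply_relative64(void* load_base,     // where the segment actually loaded
                             uint64_t link_vaddr, // its link-time virtual address
                             void* reloc_site,    // address of the 64-bit word to patch
                             int64_t addend) {    // relocation addend
  // Load bias: actual load address minus link-time virtual address.
  int64_t base_delta =
      reinterpret_cast<int64_t>(load_base) - static_cast<int64_t>(link_vaddr);
  uint64_t relocated = static_cast<uint64_t>(base_delta + addend);
  std::memcpy(reloc_site, &relocated, sizeof(relocated));  // patch the word in place
}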