diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f29218229..9796aaef1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -84,7 +84,12 @@ endif() get_version ( "1.1.9" ) set (SO_MAJOR 1) set (SO_MINOR 1) -set (SO_PATCH 9) +if ( ${ROCM_PATCH_VERSION} ) + set ( SO_PATCH ${ROCM_PATCH_VERSION}) + set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) +else () + set(SO_PATCH 9) +endif () set ( SO_VERSION_STRING "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}" ) set ( PACKAGE_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}.${VERSION_COMMIT_COUNT}-${VERSION_JOB}-${VERSION_HASH}" ) @@ -106,7 +111,7 @@ include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode ) add_definitions ( -DROCR_BUILD_ID=${PACKAGE_VERSION_STRING} ) ## Set RUNPATH - ../../lib covers use of the legacy symlink in /hsa/lib/ -set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib") +set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib;$ORIGIN/../../lib64;$ORIGIN/../lib64") ## ------------------------- Linux Compiler and Linker options ------------------------- set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function" ) diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h index 801f58660..a0de033a1 100644 --- a/src/core/inc/amd_aql_queue.h +++ b/src/core/inc/amd_aql_queue.h @@ -74,6 +74,9 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo /// @brief Change the scheduling priority of the queue hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override; + /// @brief Destroy ref counted queue + void Destroy() override; + /// @brief Atomically reads the Read index of with Acquire semantics /// /// @return uint64_t Value of read index diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index 61aefcdec..a538518af 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -256,6 +256,9 @@ class GpuAgent : public GpuAgentInt { uint32_t group_segment_size, core::Queue** queue) override; + // @brief Decrement GWS ref count. + void GWSRelease(); + // @brief Override from amd::GpuAgentInt. void AcquireQueueScratch(ScratchInfo& scratch) override; @@ -488,6 +491,9 @@ class GpuAgent : public GpuAgentInt { // @brief Create internal queues and blits. void InitDma(); + // @brief Setup GWS accessing queue. + void InitGWS(); + // Bind index of peer device that is connected via xGMI links lazy_ptr& GetXgmiBlit(const core::Agent& peer_agent); @@ -504,6 +510,13 @@ class GpuAgent : public GpuAgentInt { // @brief Alternative aperture size. Only on KV. size_t ape1_size_; + // @brief Queue with GWS access. + struct { + lazy_ptr queue_; + int ref_ct_; + KernelMutex lock_; + } gws_queue_; + DISALLOW_COPY_AND_ASSIGN(GpuAgent); }; diff --git a/src/core/inc/queue.h b/src/core/inc/queue.h index ba8200e82..c7f07342a 100644 --- a/src/core/inc/queue.h +++ b/src/core/inc/queue.h @@ -150,6 +150,8 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { virtual ~Queue() {} + virtual void Destroy() { delete this; } + /// @brief Returns the handle of Queue's public data type /// /// @param queue Pointer to an instance of Queue implementation object diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index f2bdf85d3..b012f6752 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -306,6 +306,14 @@ AqlQueue::~AqlQueue() { core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_); } +void AqlQueue::Destroy() { + if (amd_queue_.hsa_queue.type & HSA_QUEUE_TYPE_COOPERATIVE) { + agent_->GWSRelease(); + return; + } + delete this; +} + uint64_t AqlQueue::LoadReadIndexAcquire() { return atomic::Load(&amd_queue_.read_dispatch_id, std::memory_order_acquire); } diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index de865f977..b2acd54af 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -533,7 +533,7 @@ void GpuAgent::InitDma() { auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue) { const std::string& sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); - bool use_sdma = (isa_->GetMajorVersion() != 8); + bool use_sdma = ((isa_->GetMajorVersion() != 8) && (isa_->GetMajorVersion() != 10)); if (sdma_override.size() != 0) use_sdma = (sdma_override == "1"); if (use_sdma && (HSA_PROFILE_BASE == profile_)) { @@ -580,6 +580,35 @@ void GpuAgent::InitDma() { for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) { blits_[idx].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility]); }); } + + // GWS queues. + InitGWS(); +} + +void GpuAgent::InitGWS() { + gws_queue_.queue_.reset([this]() { + if (properties_.NumGws == 0) return (core::Queue*)nullptr; + std::unique_ptr queue(CreateInterceptibleQueue()); + if (queue == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Internal queue creation failed."); + + uint32_t discard; + auto status = hsaKmtAllocQueueGWS(queue->amd_queue_.hsa_queue.id, 1, &discard); + if (status != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "GWS allocation failed."); + + queue->amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_COOPERATIVE | HSA_QUEUE_TYPE_MULTI; + gws_queue_.ref_ct_ = 0; + return queue.release(); + }); +} + +void GpuAgent::GWSRelease() { + ScopedAcquire lock(&gws_queue_.lock_); + gws_queue_.ref_ct_--; + if (gws_queue_.ref_ct_ != 0) return; + InitGWS(); } void GpuAgent::PreloadBlits() { @@ -869,6 +898,9 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AMD_AGENT_INFO_DOMAIN: *((uint32_t*)value) = static_cast(properties_.Domain); break; + case HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES: + *((bool*)value) = properties_.NumGws != 0; + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; @@ -881,6 +913,18 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, void* data, uint32_t private_segment_size, uint32_t group_segment_size, core::Queue** queue) { + // Handle GWS queues. + if (queue_type & HSA_QUEUE_TYPE_COOPERATIVE) { + ScopedAcquire lock(&gws_queue_.lock_); + auto ret = (*gws_queue_.queue_).get(); + if (ret != nullptr) { + gws_queue_.ref_ct_++; + *queue = ret; + return HSA_STATUS_SUCCESS; + } + return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION; + } + // AQL queues must be a power of two in length. if (!IsPowerOfTwo(size)) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index 6bc2919d6..7de29f965 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -687,8 +687,8 @@ hsa_status_t hsa_queue_create( TRY; IS_OPEN(); - if ((queue == nullptr) || (size == 0) || (!IsPowerOfTwo(size)) || (type < HSA_QUEUE_TYPE_MULTI) || - (type > HSA_QUEUE_TYPE_SINGLE)) { + if ((queue == nullptr) || (size == 0) || (!IsPowerOfTwo(size)) || + (type > HSA_QUEUE_TYPE_COOPERATIVE)) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -701,7 +701,12 @@ hsa_status_t hsa_queue_create( assert(HSA_STATUS_SUCCESS == status); if (agent_queue_type == HSA_QUEUE_TYPE_SINGLE && - type != HSA_QUEUE_TYPE_SINGLE) { + ((type & HSA_QUEUE_TYPE_SINGLE) != HSA_QUEUE_TYPE_SINGLE)) { + return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION; + } + + if ((type & HSA_QUEUE_TYPE_COOPERATIVE) && + ((type & HSA_QUEUE_TYPE_SINGLE) != HSA_QUEUE_TYPE_MULTI)) { return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION; } @@ -758,7 +763,7 @@ hsa_status_t hsa_queue_destroy(hsa_queue_t* queue) { IS_BAD_PTR(queue); core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); - delete cmd_queue; + cmd_queue->Destroy(); return HSA_STATUS_SUCCESS; CATCH; } diff --git a/src/inc/hsa.h b/src/inc/hsa.h index 3979219a8..54dc78460 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -2193,7 +2193,19 @@ typedef enum { * that support a single producer may be more efficient than queues supporting * multiple producers. */ - HSA_QUEUE_TYPE_SINGLE = 1 + HSA_QUEUE_TYPE_SINGLE = 1, + /** + * Queue supports cooperative dispatches able to use GWS synchronization. + * Queues of this type must also be of type HSA_QUEUE_TYPE_MULTI and + * may be limited in number. The runtime may return the same queue to serve + * multiple hsa_queue_create calls when this type is given. Callers must + * inspect the returned queue to discover queue size. Queues of this type + * are reference counted and require a matching number of hsa_queue_destroy + * calls to release. Use of multiproducer queue mechanics is required. See + * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this + * type. + */ + HSA_QUEUE_TYPE_COOPERATIVE = 2 } hsa_queue_type_t; /** @@ -2300,9 +2312,9 @@ typedef struct hsa_queue_s { * created queue is the maximum of @p size and the value of * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent. * - * @param[in] type Type of the queue. If the value of - * ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, then @p - * type must also be ::HSA_QUEUE_TYPE_SINGLE. + * @param[in] type Type of the queue, a bitwise OR of hsa_queue_type_t values. + * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, + * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE. * * @param[in] callback Callback invoked by the HSA runtime for every * asynchronous event related to the newly created queue. May be NULL. The HSA diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 62ecf16f9..b281e4632 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -158,7 +158,12 @@ typedef enum hsa_amd_agent_info_s { * to give the full physical location of the Agent. * The type of this attribute is uint32_t. */ - HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F + HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F, + /** + * Queries for support of cooperative queues. See ::HSA_QUEUE_TYPE_COOPERATIVE. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010 } hsa_amd_agent_info_t; typedef struct hsa_amd_hdp_flush_s { diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index dc5352096..0ec868cbc 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -62,7 +62,13 @@ using namespace amd::hsa; using namespace amd::hsa::common; -static void __attribute__((noinline, optimize(0))) _loader_debug_state() {}; +#if defined __clang__ +#define NONOPTIMIZE __attribute__((noinline, optnone)) +#else +#define NONOPTIMIZE __attribute__((noinline, optimize(0))) +#endif + +NONOPTIMIZE static void _loader_debug_state() {}; r_debug _amdgpu_r_debug __attribute__((visibility("default"))) = {1, nullptr, reinterpret_cast(&_loader_debug_state),