Skip to content

Commit

Permalink
ROCm 3.3.0 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
skeelyamd committed Apr 1, 2020
1 parent 3eed1b4 commit 1a56c09
Show file tree
Hide file tree
Showing 22 changed files with 132 additions and 106 deletions.
2 changes: 0 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,6 @@ link_directories ( ${HSAKMT_LIB_PATH} )

## Set include directories for ROCr runtime
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR} )
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/inc )
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/core/inc )
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode )

## ROCr build internal versioning
Expand Down
2 changes: 1 addition & 1 deletion src/core/common/hsa_table_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
//
////////////////////////////////////////////////////////////////////////////////

#include "hsa_api_trace.h"
#include "inc/hsa_api_trace.h"
#include "core/inc/hsa_api_trace_int.h"

static const HsaApiTable* hsaApiTable;
Expand Down
28 changes: 13 additions & 15 deletions src/core/inc/amd_gpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ struct ScratchInfo {
ptrdiff_t queue_process_offset;
bool large;
bool retry;
hsa_signal_t queue_retry;
uint64_t wanted_slots;
};

// @brief Interface to represent a GPU agent.
Expand Down Expand Up @@ -265,22 +267,8 @@ class GpuAgent : public GpuAgentInt {
// @brief Override from amd::GpuAgentInt.
void ReleaseQueueScratch(ScratchInfo& scratch) override;

// @brief Register signal for notification when scratch may become available.
// @p signal is notified by OR'ing with @p value.
void AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) {
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
scratch_notifiers_[signal] = value;
}

// @brief Deregister scratch notification signal.
void RemoveScratchNotifier(hsa_signal_t signal) {
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
scratch_notifiers_.erase(signal);
}

// @brief Override from amd::GpuAgentInt.
void TranslateTime(core::Signal* signal,
hsa_amd_profiling_dispatch_time_t& time) override;
void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) override;

// @brief Override from amd::GpuAgentInt.
void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) override;
Expand Down Expand Up @@ -494,6 +482,16 @@ class GpuAgent : public GpuAgentInt {
// @brief Setup GWS accessing queue.
void InitGWS();

// @brief Register signal for notification when scratch may become available.
// @p signal is notified by OR'ing with @p value.
// NOTE(review): assumes the caller already holds scratch_lock_ (the locked
// public variant was removed in this change) — confirm at call sites.
// @return true if the notifier was registered, false if @p signal is invalid.
bool AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) {
  // A null signal handle can never be notified; reject it instead of
  // registering it. (Original check was inverted: `!= 0` rejected every
  // valid signal and accepted only the null one, so scratch-retry
  // notification could never be armed.)
  if (signal.handle == 0) return false;
  scratch_notifiers_[signal] = value;
  return true;
}

// @brief Drop every scratch notification signal previously registered
// with AddScratchNotifier; no signals are waited on or modified here.
void ClearScratchNotifiers() {
  scratch_notifiers_.clear();
}

// Bind index of peer device that is connected via xGMI links
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
Expand Down
10 changes: 5 additions & 5 deletions src/core/inc/amd_hsa_code.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,11 @@ mes of its
#ifndef AMD_HSA_CODE_HPP_
#define AMD_HSA_CODE_HPP_

#include "amd_elf_image.hpp"
#include "amd_hsa_elf.h"
#include "amd_hsa_kernel_code.h"
#include "hsa.h"
#include "hsa_ext_finalize.h"
#include "core/inc/amd_elf_image.hpp"
#include "inc/amd_hsa_elf.h"
#include "inc/amd_hsa_kernel_code.h"
#include "inc/hsa.h"
#include "inc/hsa_ext_finalize.h"
#include <memory>
#include <sstream>
#include <cassert>
Expand Down
8 changes: 4 additions & 4 deletions src/core/inc/amd_hsa_loader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@

#include <cstddef>
#include <cstdint>
#include "hsa.h"
#include "hsa_ext_image.h"
#include "hsa_ven_amd_loader.h"
#include "amd_hsa_elf.h"
#include "inc/hsa.h"
#include "inc/hsa_ext_image.h"
#include "inc/hsa_ven_amd_loader.h"
#include "inc/amd_hsa_elf.h"
#include <string>
#include <mutex>
#include <vector>
Expand Down
6 changes: 3 additions & 3 deletions src/core/inc/hsa_ext_amd_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@
#ifndef HSA_RUNTIME_CORE_INC_EXT_AMD_H_
#define HSA_RUNTIME_CORE_INC_EXT_AMD_H_

#include "hsa.h"
#include "hsa_ext_image.h"
#include "hsa_ext_amd.h"
#include "inc/hsa.h"
#include "inc/hsa_ext_image.h"
#include "inc/hsa_ext_amd.h"

// Wrap internal implementation inside AMD namespace
namespace AMD {
Expand Down
2 changes: 1 addition & 1 deletion src/core/inc/hsa_ext_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
#include <string>
#include <vector>

#include "hsa_api_trace_int.h"
#include "core/inc/hsa_api_trace_int.h"

#include "core/util/os.h"
#include "core/util/utils.h"
Expand Down
2 changes: 1 addition & 1 deletion src/core/inc/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
#include "core/util/utils.h"

#include "core/inc/amd_loader_context.hpp"
#include "amd_hsa_code.hpp"
#include "core/inc/amd_hsa_code.hpp"

//---------------------------------------------------------------------------//
// Constants //
Expand Down
22 changes: 14 additions & 8 deletions src/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,6 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
0);
#endif

// Initialize scratch memory related entities
InitScratchSRD();

// Set group and private memory apertures in amd_queue_.
auto& regions = agent->regions();

Expand Down Expand Up @@ -257,6 +254,11 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
assert(Signal != nullptr && "Should have thrown!\n");
amd_queue_.queue_inactive_signal = core::DefaultSignal::Convert(Signal);
}

// Initialize scratch memory related entities
queue_scratch_.queue_retry = amd_queue_.queue_inactive_signal;
InitScratchSRD();

if (AMD::hsa_amd_signal_async_handler(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE,
0, DynamicScratchHandler, this) != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
Expand Down Expand Up @@ -729,7 +731,6 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {

if ((queue->dynamicScratchState & ERROR_HANDLER_SCRATCH_RETRY) == ERROR_HANDLER_SCRATCH_RETRY) {
queue->dynamicScratchState &= ~ERROR_HANDLER_SCRATCH_RETRY;
queue->agent_->RemoveScratchNotifier(queue->amd_queue_.queue_inactive_signal);
changeWait = true;
waitVal = 0;
HSA::hsa_signal_and_relaxed(queue->amd_queue_.queue_inactive_signal, ~0x8000000000000000ull);
Expand Down Expand Up @@ -771,18 +772,23 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {

uint32_t scratch_request = pkt.dispatch.private_segment_size;

const uint32_t MaxScratchSlots =
(queue->amd_queue_.max_cu_id + 1) * queue->agent_->properties().MaxSlotsScratchCU;

scratch.size_per_thread = scratch_request;
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
// Align whole waves to 1KB.
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;
scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
#ifndef NDEBUG
scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) *
pkt.dispatch.grid_size_z) / scratch.lanes_per_wave;
scratch.wanted_slots = Min(scratch.wanted_slots, MaxScratchSlots);
#endif

queue->agent_->AcquireQueueScratch(scratch);

if (scratch.retry) {
queue->agent_->AddScratchNotifier(queue->amd_queue_.queue_inactive_signal,
0x8000000000000000ull);
queue->dynamicScratchState |= ERROR_HANDLER_SCRATCH_RETRY;
changeWait = true;
waitVal = error_code;
Expand Down
26 changes: 12 additions & 14 deletions src/core/runtime/amd_cpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
#include "core/inc/amd_memory_region.h"
#include "core/inc/host_queue.h"

#include "hsa_ext_image.h"
#include "inc/hsa_ext_image.h"

namespace amd {
CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
Expand All @@ -71,14 +71,12 @@ void CpuAgent::InitRegionList() {
hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks,
&mem_props[0])) {
std::vector<HsaMemoryProperties>::iterator system_prop =
std::find_if(mem_props.begin(), mem_props.end(),
[](HsaMemoryProperties prop) -> bool {
std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool {
return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM);
});

if (system_prop != mem_props.end()) {
MemoryRegion* system_region_fine =
new MemoryRegion(true, is_apu_node, this, *system_prop);
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, *system_prop);

regions_.push_back(system_region_fine);

Expand All @@ -92,18 +90,18 @@ void CpuAgent::InitRegionList() {
HsaMemoryProperties system_props;
std::memset(&system_props, 0, sizeof(HsaMemoryProperties));

const uintptr_t system_base = os::GetUserModeVirtualMemoryBase();
const size_t system_physical_size = os::GetUsablePhysicalHostMemorySize();
assert(system_physical_size != 0);

system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
system_props.SizeInBytes = (HSAuint64)system_physical_size;
system_props.VirtualBaseAddress = (HSAuint64)(system_base);
system_props.SizeInBytes = 0;
system_props.VirtualBaseAddress = 0;

MemoryRegion* system_region =
new MemoryRegion(true, is_apu_node, this, system_props);
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props);
regions_.push_back(system_region_fine);

regions_.push_back(system_region);
if (!is_apu_node) {
MemoryRegion* system_region_coarse =
new MemoryRegion(false, is_apu_node, this, system_props);
regions_.push_back(system_region_coarse);
}
}
}
}
Expand Down
42 changes: 24 additions & 18 deletions src/core/runtime/amd_gpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
: GpuAgentInt(node),
properties_(node_props),
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
scratch_used_large_(0),
queues_(),
local_region_(NULL),
is_kv_device_(false),
Expand Down Expand Up @@ -323,11 +324,9 @@ void GpuAgent::InitRegionList() {
}
break;
case HSA_HEAPTYPE_MMIO_REMAP:
if (core::Runtime::runtime_singleton_->flag().fine_grain_pcie()) {
// Remap offsets defined in kfd_ioctl.h
HDP_flush_.HDP_MEM_FLUSH_CNTL = (uint32_t*)mem_props[mem_idx].VirtualBaseAddress;
HDP_flush_.HDP_REG_FLUSH_CNTL = HDP_flush_.HDP_MEM_FLUSH_CNTL + 1;
}
// Remap offsets defined in kfd_ioctl.h
HDP_flush_.HDP_MEM_FLUSH_CNTL = (uint32_t*)mem_props[mem_idx].VirtualBaseAddress;
HDP_flush_.HDP_REG_FLUSH_CNTL = HDP_flush_.HDP_MEM_FLUSH_CNTL + 1;
break;
default:
continue;
Expand Down Expand Up @@ -936,7 +935,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
}

// Allocate scratch memory
ScratchInfo scratch;
ScratchInfo scratch = {0};
if (private_segment_size == UINT_MAX) {
private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
}
Expand Down Expand Up @@ -1038,15 +1037,19 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {

// Retry if large may yield needed space.
if (scratch_used_large_ != 0) {
scratch.retry = true;
if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
return;
}

// Fail scratch allocation if reducing occupancy is disabled.
if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return;

// Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print("Failed to map requested scratch - reducing queue occupancy.\n");
uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
uint64_t total_waves = scratch.size / size_per_wave;
debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
scratch.size);
const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
const uint64_t total_waves = scratch.size / size_per_wave;
uint64_t waves_per_cu = total_waves / num_cus;
while (waves_per_cu != 0) {
size_t size = waves_per_cu * num_cus * size_per_wave;
Expand All @@ -1058,12 +1061,14 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
// Scratch allocated and either full profile or map succeeded.
scratch.queue_base = base;
scratch.size = size;
scratch.queue_process_offset =
(need_queue_scratch_base)
? uintptr_t(scratch.queue_base)
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
scratch.queue_process_offset = (need_queue_scratch_base)
? uintptr_t(scratch.queue_base)
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
scratch.large = true;
scratch_used_large_ += scratch.size;
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
return;
}
scratch_pool_.free(base);
Expand All @@ -1073,7 +1078,7 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
// Failed to allocate minimal scratch
assert(scratch.queue_base == nullptr && "bad scratch data");
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print("Could not allocate scratch for one wave per CU.\n");
debug_print(" Could not allocate scratch for one wave per CU.\n");
}

void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
Expand All @@ -1093,12 +1098,13 @@ void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
if (scratch.large) scratch_used_large_ -= scratch.size;

// Notify waiters that additional scratch may be available.
for (auto notifier : scratch_notifiers_)
for (auto notifier : scratch_notifiers_) {
HSA::hsa_signal_or_relaxed(notifier.first, notifier.second);
}
ClearScratchNotifiers();
}

void GpuAgent::TranslateTime(core::Signal* signal,
hsa_amd_profiling_dispatch_time_t& time) {
void GpuAgent::TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) {
uint64_t start, end;
signal->GetRawTs(false, start, end);
// Order is important, we want to translate the end time first to ensure that packet duration is
Expand Down
2 changes: 1 addition & 1 deletion src/core/runtime/hsa_ven_amd_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
//
////////////////////////////////////////////////////////////////////////////////

#include "hsa_ven_amd_loader.h"
#include "inc/hsa_ven_amd_loader.h"

#include "core/inc/amd_hsa_loader.hpp"
#include "core/inc/runtime.h"
Expand Down
12 changes: 9 additions & 3 deletions src/core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,13 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
{ // memory_lock protects access to the NMappedNodes array and fragment user data since these may
// change with calls to memory APIs.
ScopedAcquire<KernelMutex> lock(&memory_lock_);
hsaKmtQueryPointerInfo(ptr, &thunkInfo);

// We don't care if this returns an error code.
// The type will be HSA_EXT_POINTER_TYPE_UNKNOWN if so.
auto err = hsaKmtQueryPointerInfo(ptr, &thunkInfo);
assert(((err == HSAKMT_STATUS_SUCCESS) || (thunkInfo.Type == HSA_POINTER_UNKNOWN)) &&
"Thunk ptr info error and not type HSA_POINTER_UNKNOWN.");

if (returnListData) {
assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
"PointerInfo: Thunk returned more than all agents in NMappedNodes.");
Expand Down Expand Up @@ -798,8 +804,8 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a

retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));

// Temp: workaround thunk bug, IPC memory has garbage in Node.
// retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
// IPC and Graphics memory may come from a node that does not have an agent in this process.
// Ex. ROCR_VISIBLE_DEVICES or peer GPU is not supported by ROCm.
auto nodeAgents = agents_by_node_.find(thunkInfo.Node);
if (nodeAgents != agents_by_node_.end())
retInfo.agentOwner = nodeAgents->second[0]->public_handle();
Expand Down
Loading

0 comments on commit 1a56c09

Please sign in to comment.