Skip to content

Commit

Permalink
ROCm 3.3.0 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
skeelyamd committed Apr 1, 2020
1 parent 3eed1b4 commit 1a56c09
Show file tree
Hide file tree
Showing 22 changed files with 132 additions and 106 deletions.
2 changes: 0 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,6 @@ link_directories ( ${HSAKMT_LIB_PATH} )

## Set include directories for ROCr runtime
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR} )
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/inc )
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/core/inc )
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode )

## ROCr build internal versioning
Expand Down
2 changes: 1 addition & 1 deletion src/core/common/hsa_table_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
//
////////////////////////////////////////////////////////////////////////////////

#include "hsa_api_trace.h"
#include "inc/hsa_api_trace.h"
#include "core/inc/hsa_api_trace_int.h"

static const HsaApiTable* hsaApiTable;
Expand Down
28 changes: 13 additions & 15 deletions src/core/inc/amd_gpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ struct ScratchInfo {
ptrdiff_t queue_process_offset;
bool large;
bool retry;
hsa_signal_t queue_retry;
uint64_t wanted_slots;
};

// @brief Interface to represent a GPU agent.
Expand Down Expand Up @@ -265,22 +267,8 @@ class GpuAgent : public GpuAgentInt {
// @brief Override from amd::GpuAgentInt.
void ReleaseQueueScratch(ScratchInfo& scratch) override;

// @brief Register signal for notification when scratch may become available.
// @p signal is notified by OR'ing with @p value.
void AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) {
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
scratch_notifiers_[signal] = value;
}

// @brief Deregister scratch notification signal.
void RemoveScratchNotifier(hsa_signal_t signal) {
ScopedAcquire<KernelMutex> lock(&scratch_lock_);
scratch_notifiers_.erase(signal);
}

// @brief Override from amd::GpuAgentInt.
void TranslateTime(core::Signal* signal,
hsa_amd_profiling_dispatch_time_t& time) override;
void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) override;

// @brief Override from amd::GpuAgentInt.
void TranslateTime(core::Signal* signal, hsa_amd_profiling_async_copy_time_t& time) override;
Expand Down Expand Up @@ -494,6 +482,16 @@ class GpuAgent : public GpuAgentInt {
// @brief Setup GWS accessing queue.
void InitGWS();

// @brief Register signal for notification when scratch may become available.
// @p signal is notified by OR'ing with @p value.
// NOTE(review): assumes the caller already holds scratch_lock_ (the locked
// public variant was removed in this change) — confirm at call sites.
// @return true if the notifier was registered, false if @p signal is invalid.
bool AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) {
  // A null signal handle can never be notified; reject it instead of
  // registering it. (Original check was inverted: `!= 0` rejected every
  // valid signal and accepted only the null one, so scratch-retry
  // notification could never be armed.)
  if (signal.handle == 0) return false;
  scratch_notifiers_[signal] = value;
  return true;
}

// @brief Drop every scratch notification signal previously registered
// with AddScratchNotifier; no signals are waited on or modified here.
void ClearScratchNotifiers() {
  scratch_notifiers_.clear();
}

// Bind index of peer device that is connected via xGMI links
lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
Expand Down
10 changes: 5 additions & 5 deletions src/core/inc/amd_hsa_code.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,11 @@ mes of its
#ifndef AMD_HSA_CODE_HPP_
#define AMD_HSA_CODE_HPP_

#include "amd_elf_image.hpp"
#include "amd_hsa_elf.h"
#include "amd_hsa_kernel_code.h"
#include "hsa.h"
#include "hsa_ext_finalize.h"
#include "core/inc/amd_elf_image.hpp"
#include "inc/amd_hsa_elf.h"
#include "inc/amd_hsa_kernel_code.h"
#include "inc/hsa.h"
#include "inc/hsa_ext_finalize.h"
#include <memory>
#include <sstream>
#include <cassert>
Expand Down
8 changes: 4 additions & 4 deletions src/core/inc/amd_hsa_loader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@

#include <cstddef>
#include <cstdint>
#include "hsa.h"
#include "hsa_ext_image.h"
#include "hsa_ven_amd_loader.h"
#include "amd_hsa_elf.h"
#include "inc/hsa.h"
#include "inc/hsa_ext_image.h"
#include "inc/hsa_ven_amd_loader.h"
#include "inc/amd_hsa_elf.h"
#include <string>
#include <mutex>
#include <vector>
Expand Down
6 changes: 3 additions & 3 deletions src/core/inc/hsa_ext_amd_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@
#ifndef HSA_RUNTIME_CORE_INC_EXT_AMD_H_
#define HSA_RUNTIME_CORE_INC_EXT_AMD_H_

#include "hsa.h"
#include "hsa_ext_image.h"
#include "hsa_ext_amd.h"
#include "inc/hsa.h"
#include "inc/hsa_ext_image.h"
#include "inc/hsa_ext_amd.h"

// Wrap internal implementation inside AMD namespace
namespace AMD {
Expand Down
2 changes: 1 addition & 1 deletion src/core/inc/hsa_ext_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
#include <string>
#include <vector>

#include "hsa_api_trace_int.h"
#include "core/inc/hsa_api_trace_int.h"

#include "core/util/os.h"
#include "core/util/utils.h"
Expand Down
2 changes: 1 addition & 1 deletion src/core/inc/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
#include "core/util/utils.h"

#include "core/inc/amd_loader_context.hpp"
#include "amd_hsa_code.hpp"
#include "core/inc/amd_hsa_code.hpp"

//---------------------------------------------------------------------------//
// Constants //
Expand Down
22 changes: 14 additions & 8 deletions src/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,6 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
0);
#endif

// Initialize scratch memory related entities
InitScratchSRD();

// Set group and private memory apertures in amd_queue_.
auto& regions = agent->regions();

Expand Down Expand Up @@ -257,6 +254,11 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
assert(Signal != nullptr && "Should have thrown!\n");
amd_queue_.queue_inactive_signal = core::DefaultSignal::Convert(Signal);
}

// Initialize scratch memory related entities
queue_scratch_.queue_retry = amd_queue_.queue_inactive_signal;
InitScratchSRD();

if (AMD::hsa_amd_signal_async_handler(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE,
0, DynamicScratchHandler, this) != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
Expand Down Expand Up @@ -729,7 +731,6 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {

if ((queue->dynamicScratchState & ERROR_HANDLER_SCRATCH_RETRY) == ERROR_HANDLER_SCRATCH_RETRY) {
queue->dynamicScratchState &= ~ERROR_HANDLER_SCRATCH_RETRY;
queue->agent_->RemoveScratchNotifier(queue->amd_queue_.queue_inactive_signal);
changeWait = true;
waitVal = 0;
HSA::hsa_signal_and_relaxed(queue->amd_queue_.queue_inactive_signal, ~0x8000000000000000ull);
Expand Down Expand Up @@ -771,18 +772,23 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {

uint32_t scratch_request = pkt.dispatch.private_segment_size;

const uint32_t MaxScratchSlots =
(queue->amd_queue_.max_cu_id + 1) * queue->agent_->properties().MaxSlotsScratchCU;

scratch.size_per_thread = scratch_request;
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
// Align whole waves to 1KB.
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;
scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
#ifndef NDEBUG
scratch.wanted_slots = ((uint64_t(pkt.dispatch.grid_size_x) * pkt.dispatch.grid_size_y) *
pkt.dispatch.grid_size_z) / scratch.lanes_per_wave;
scratch.wanted_slots = Min(scratch.wanted_slots, MaxScratchSlots);
#endif

queue->agent_->AcquireQueueScratch(scratch);

if (scratch.retry) {
queue->agent_->AddScratchNotifier(queue->amd_queue_.queue_inactive_signal,
0x8000000000000000ull);
queue->dynamicScratchState |= ERROR_HANDLER_SCRATCH_RETRY;
changeWait = true;
waitVal = error_code;
Expand Down
26 changes: 12 additions & 14 deletions src/core/runtime/amd_cpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
#include "core/inc/amd_memory_region.h"
#include "core/inc/host_queue.h"

#include "hsa_ext_image.h"
#include "inc/hsa_ext_image.h"

namespace amd {
CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
Expand All @@ -71,14 +71,12 @@ void CpuAgent::InitRegionList() {
hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks,
&mem_props[0])) {
std::vector<HsaMemoryProperties>::iterator system_prop =
std::find_if(mem_props.begin(), mem_props.end(),
[](HsaMemoryProperties prop) -> bool {
std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool {
return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM);
});

if (system_prop != mem_props.end()) {
MemoryRegion* system_region_fine =
new MemoryRegion(true, is_apu_node, this, *system_prop);
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, *system_prop);

regions_.push_back(system_region_fine);

Expand All @@ -92,18 +90,18 @@ void CpuAgent::InitRegionList() {
HsaMemoryProperties system_props;
std::memset(&system_props, 0, sizeof(HsaMemoryProperties));

const uintptr_t system_base = os::GetUserModeVirtualMemoryBase();
const size_t system_physical_size = os::GetUsablePhysicalHostMemorySize();
assert(system_physical_size != 0);

system_props.HeapType = HSA_HEAPTYPE_SYSTEM;
system_props.SizeInBytes = (HSAuint64)system_physical_size;
system_props.VirtualBaseAddress = (HSAuint64)(system_base);
system_props.SizeInBytes = 0;
system_props.VirtualBaseAddress = 0;

MemoryRegion* system_region =
new MemoryRegion(true, is_apu_node, this, system_props);
MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props);
regions_.push_back(system_region_fine);

regions_.push_back(system_region);
if (!is_apu_node) {
MemoryRegion* system_region_coarse =
new MemoryRegion(false, is_apu_node, this, system_props);
regions_.push_back(system_region_coarse);
}
}
}
}
Expand Down
42 changes: 24 additions & 18 deletions src/core/runtime/amd_gpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
: GpuAgentInt(node),
properties_(node_props),
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
scratch_used_large_(0),
queues_(),
local_region_(NULL),
is_kv_device_(false),
Expand Down Expand Up @@ -323,11 +324,9 @@ void GpuAgent::InitRegionList() {
}
break;
case HSA_HEAPTYPE_MMIO_REMAP:
if (core::Runtime::runtime_singleton_->flag().fine_grain_pcie()) {
// Remap offsets defined in kfd_ioctl.h
HDP_flush_.HDP_MEM_FLUSH_CNTL = (uint32_t*)mem_props[mem_idx].VirtualBaseAddress;
HDP_flush_.HDP_REG_FLUSH_CNTL = HDP_flush_.HDP_MEM_FLUSH_CNTL + 1;
}
// Remap offsets defined in kfd_ioctl.h
HDP_flush_.HDP_MEM_FLUSH_CNTL = (uint32_t*)mem_props[mem_idx].VirtualBaseAddress;
HDP_flush_.HDP_REG_FLUSH_CNTL = HDP_flush_.HDP_MEM_FLUSH_CNTL + 1;
break;
default:
continue;
Expand Down Expand Up @@ -936,7 +935,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
}

// Allocate scratch memory
ScratchInfo scratch;
ScratchInfo scratch = {0};
if (private_segment_size == UINT_MAX) {
private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
}
Expand Down Expand Up @@ -1038,15 +1037,19 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {

// Retry if large may yield needed space.
if (scratch_used_large_ != 0) {
scratch.retry = true;
if (AddScratchNotifier(scratch.queue_retry, 0x8000000000000000ull)) scratch.retry = true;
return;
}

// Fail scratch allocation if reducing occupancy is disabled.
if (core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter()) return;

// Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print("Failed to map requested scratch - reducing queue occupancy.\n");
uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
uint64_t total_waves = scratch.size / size_per_wave;
debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n",
scratch.size);
const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
const uint64_t total_waves = scratch.size / size_per_wave;
uint64_t waves_per_cu = total_waves / num_cus;
while (waves_per_cu != 0) {
size_t size = waves_per_cu * num_cus * size_per_wave;
Expand All @@ -1058,12 +1061,14 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
// Scratch allocated and either full profile or map succeeded.
scratch.queue_base = base;
scratch.size = size;
scratch.queue_process_offset =
(need_queue_scratch_base)
? uintptr_t(scratch.queue_base)
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
scratch.queue_process_offset = (need_queue_scratch_base)
? uintptr_t(scratch.queue_base)
: uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base());
scratch.large = true;
scratch_used_large_ += scratch.size;
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print(" %ld scratch mapped, %.2f%% occupancy.\n", scratch.size,
float(waves_per_cu * num_cus) / scratch.wanted_slots * 100.0f);
return;
}
scratch_pool_.free(base);
Expand All @@ -1073,7 +1078,7 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
// Failed to allocate minimal scratch
assert(scratch.queue_base == nullptr && "bad scratch data");
if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message())
debug_print("Could not allocate scratch for one wave per CU.\n");
debug_print(" Could not allocate scratch for one wave per CU.\n");
}

void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
Expand All @@ -1093,12 +1098,13 @@ void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) {
if (scratch.large) scratch_used_large_ -= scratch.size;

// Notify waiters that additional scratch may be available.
for (auto notifier : scratch_notifiers_)
for (auto notifier : scratch_notifiers_) {
HSA::hsa_signal_or_relaxed(notifier.first, notifier.second);
}
ClearScratchNotifiers();
}

void GpuAgent::TranslateTime(core::Signal* signal,
hsa_amd_profiling_dispatch_time_t& time) {
void GpuAgent::TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) {
uint64_t start, end;
signal->GetRawTs(false, start, end);
// Order is important, we want to translate the end time first to ensure that packet duration is
Expand Down
2 changes: 1 addition & 1 deletion src/core/runtime/hsa_ven_amd_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
//
////////////////////////////////////////////////////////////////////////////////

#include "hsa_ven_amd_loader.h"
#include "inc/hsa_ven_amd_loader.h"

#include "core/inc/amd_hsa_loader.hpp"
#include "core/inc/runtime.h"
Expand Down
12 changes: 9 additions & 3 deletions src/core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,13 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
{ // memory_lock protects access to the NMappedNodes array and fragment user data since these may
// change with calls to memory APIs.
ScopedAcquire<KernelMutex> lock(&memory_lock_);
hsaKmtQueryPointerInfo(ptr, &thunkInfo);

// We don't care if this returns an error code.
// The type will be HSA_EXT_POINTER_TYPE_UNKNOWN if so.
auto err = hsaKmtQueryPointerInfo(ptr, &thunkInfo);
assert(((err == HSAKMT_STATUS_SUCCESS) || (thunkInfo.Type == HSA_POINTER_UNKNOWN)) &&
"Thunk ptr info error and not type HSA_POINTER_UNKNOWN.");

if (returnListData) {
assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
"PointerInfo: Thunk returned more than all agents in NMappedNodes.");
Expand Down Expand Up @@ -798,8 +804,8 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a

retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));

// Temp: workaround thunk bug, IPC memory has garbage in Node.
// retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
// IPC and Graphics memory may come from a node that does not have an agent in this process.
// Ex. ROCR_VISIBLE_DEVICES or peer GPU is not supported by ROCm.
auto nodeAgents = agents_by_node_.find(thunkInfo.Node);
if (nodeAgents != agents_by_node_.end())
retInfo.agentOwner = nodeAgents->second[0]->public_handle();
Expand Down
Loading

0 comments on commit 1a56c09

Please sign in to comment.