Skip to content

Commit

Permalink
Merge pull request #120 from RadeonOpenCompute/rocm-4.2.x
Browse files Browse the repository at this point in the history
ROCm 4.2.0 updates
  • Loading branch information
skeelyamd authored May 10, 2021
2 parents 7758972 + 337e3e5 commit ea47a79
Show file tree
Hide file tree
Showing 23 changed files with 425 additions and 138 deletions.
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ if (ROCM_CCACHE_BUILD)
endif() # if (ROCM_CCACHE_BUILD)

## Get version strings
get_version ( "1.2.0" )
get_version ( "1.3.0" )
if ( ${ROCM_PATCH_VERSION} )
set ( VERSION_PATCH ${ROCM_PATCH_VERSION})
endif()
Expand Down
6 changes: 6 additions & 0 deletions src/core/common/hsa_table_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1145,6 +1145,12 @@ hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
return amdExtTable->hsa_amd_deregister_deallocation_callback_fn(ptr, callback);
}

// Mirrors Amd Extension Apis
// Exported entry point: forwards both arguments, unmodified, to the
// hsa_amd_signal_value_pointer_fn function pointer held in amdExtTable and
// returns whatever status that implementation reports.
hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal,
                                                  volatile hsa_signal_value_t** value_ptr) {
  const auto impl = amdExtTable->hsa_amd_signal_value_pointer_fn;
  return impl(signal, value_ptr);
}

// Tools only table interfaces.
namespace rocr {

Expand Down
240 changes: 155 additions & 85 deletions src/core/inc/amd_gpu_shaders.h

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/core/inc/hsa_ext_amd_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,10 @@ hsa_status_t hsa_amd_register_deallocation_callback(
hsa_status_t hsa_amd_deregister_deallocation_callback(
void* ptr, hsa_amd_deallocation_callback_t callback);

// Mirrors Amd Extension Apis
hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
volatile hsa_signal_value_t** value_ptr);

} // namespace amd
} // namespace rocr

Expand Down
12 changes: 9 additions & 3 deletions src/core/inc/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ class Runtime {

// Returns a non-owning pointer to the runtime's code manager member.
amd::hsa::code::AmdHsaCodeManager* code_manager() {
  return &code_manager_;
}

std::function<void*(size_t, size_t, MemoryRegion::AllocateFlags)>&
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)>&
system_allocator() {
return system_allocator_;
}
Expand All @@ -341,6 +341,10 @@ class Runtime {

// Returns the stored system clock frequency value (sys_clock_freq_).
uint64_t sys_clock_freq() const {
  return sys_clock_freq_;
}

// Stores a copy of the supplied KFD version info in this runtime instance.
void KfdVersion(const HsaVersionInfo& version) {
  kfd_version = version;
}

// Returns, by value, the KFD version info currently stored in this runtime.
HsaVersionInfo KfdVersion() const {
  return kfd_version;
}

protected:
static void AsyncEventsLoop(void*);

Expand Down Expand Up @@ -482,8 +486,7 @@ class Runtime {
std::map<const void*, AllocationRegion> allocation_map_;

// Allocator using ::system_region_
std::function<void*(size_t, size_t, MemoryRegion::AllocateFlags)>
system_allocator_;
std::function<void*(size_t size, size_t align, MemoryRegion::AllocateFlags flags)> system_allocator_;

// Deallocator using ::system_region_
std::function<void(void*)> system_deallocator_;
Expand Down Expand Up @@ -533,6 +536,9 @@ class Runtime {
// Pools KFD Events for InterruptSignal
InterruptSignal::EventPool EventPool;

// Kfd version
HsaVersionInfo kfd_version;

// Frees runtime memory when the runtime library is unloaded if safe to do so.
// Failure to release the runtime indicates an incorrect application but is
// common (example: calls library routines at process exit).
Expand Down
28 changes: 17 additions & 11 deletions src/core/runtime/amd_gpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
ASICShader compute_7;
ASICShader compute_8;
ASICShader compute_9;
ASICShader compute_1010;
ASICShader compute_10;
};

Expand All @@ -211,6 +212,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
{NULL, 0, 0, 0},
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},
{kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},
{kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4},
{kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4},
}},
{"CopyAligned",
Expand All @@ -219,20 +221,23 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
}},
{"CopyMisaligned",
{
{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
}},
{"Fill",
{
{kCodeFill7, sizeof(kCodeFill7), 19, 8},
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
{kCodeFill8, sizeof(kCodeFill8), 19, 8},
{kCodeFill10, sizeof(kCodeFill10), 19, 8},
{kCodeFill10, sizeof(kCodeFill10), 19, 8},
}}};

auto compiled_shader_it = compiled_shaders.find(func_name);
Expand All @@ -249,10 +254,13 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
asic_shader = &compiled_shader_it->second.compute_8;
break;
case 9:
asic_shader = &compiled_shader_it->second.compute_9;
asic_shader = &compiled_shader_it->second.compute_9;
break;
case 10:
asic_shader = &compiled_shader_it->second.compute_10;
if(isa_->GetMinorVersion() == 1)
asic_shader = &compiled_shader_it->second.compute_1010;
else
asic_shader = &compiled_shader_it->second.compute_10;
break;
default:
assert(false && "Precompiled shader unavailable for target");
Expand Down Expand Up @@ -1128,15 +1136,13 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
assert(((!scratch.large) | use_reclaim) && "Large scratch used with reclaim disabled.");

if (scratch.queue_base != nullptr) {
if (profile_ == HSA_PROFILE_FULL) return;
if (profile_ == HSA_PROFILE_BASE) {
HSAuint64 alternate_va;
if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
HSAKMT_STATUS_SUCCESS) {
if (scratch.large) scratch_used_large_ += scratch.size;
scratch_cache_.insert(scratch);
return;
}
HSAuint64 alternate_va;
if ((profile_ == HSA_PROFILE_FULL) ||
(hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) ==
HSAKMT_STATUS_SUCCESS)) {
if (scratch.large) scratch_used_large_ += scratch.size;
scratch_cache_.insert(scratch);
return;
}
}

Expand Down
9 changes: 7 additions & 2 deletions src/core/runtime/amd_topology.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ namespace AMD {
// Minimum acceptable KFD version numbers
static const uint kKfdVersionMajor = 0;
static const uint kKfdVersionMinor = 99;
static HsaVersionInfo kfd_version;

CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
if (node_prop.NumCPUCores == 0) {
Expand All @@ -89,7 +88,10 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
try {
gpu = new GpuAgent(node_id, node_prop);

// Check for sramecc incompatibility in gfx906 and gfx908. sramecc bit fixed in kfd 1.4.
const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion();

// Check for sramecc incompatibility due to sramecc not being reported correctly in kfd before
// 1.4.
if (gpu->isa()->IsSrameccSupported() && (kfd_version.KernelInterfaceMajorVersion <= 1 &&
kfd_version.KernelInterfaceMinorVersion < 4)) {
// gfx906 has both sramecc modes in use. Suppress the device.
Expand Down Expand Up @@ -226,6 +228,7 @@ static void SurfaceGpuList(std::vector<int32_t>& gpu_list) {
/// @brief Calls Kfd thunk to get the snapshot of the topology of the system,
/// which includes associations between, node, devices, memory and caches.
void BuildTopology() {
HsaVersionInfo kfd_version;
if (hsaKmtGetVersion(&kfd_version) != HSAKMT_STATUS_SUCCESS) {
return;
}
Expand All @@ -241,6 +244,8 @@ void BuildTopology() {
core::g_use_interrupt_wait = false;
}

core::Runtime::runtime_singleton_->KfdVersion(kfd_version);

HsaSystemProperties props;
hsaKmtReleaseSystemProperties();

Expand Down
1 change: 1 addition & 0 deletions src/core/runtime/hsa_api_trace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ void HsaApiTable::UpdateAmdExts() {
amd_ext_api.hsa_amd_memory_lock_to_pool_fn = AMD::hsa_amd_memory_lock_to_pool;
amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback;
amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback;
amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer;
}

void LoadInitialHsaApiTable() {
Expand Down
17 changes: 17 additions & 0 deletions src/core/runtime/hsa_ext_amd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,23 @@ hsa_status_t hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t nu
CATCH;
}

// Hands the caller the address of a signal's value field.
//
// hsa_signal: handle of the signal to query.
// value_ptr:  out-parameter; on success receives the address of the signal's
//             value field (signal_.value).
//
// Returns HSA_STATUS_SUCCESS on success, or HSA_STATUS_ERROR_INVALID_ARGUMENT
// when the signal is not a BusyWaitSignal. The IS_OPEN / IS_BAD_PTR / IS_VALID
// macros (defined outside this block) perform the preceding state and argument
// checks and may return early.
hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t hsa_signal,
                                          volatile hsa_signal_value_t** value_ptr) {
  TRY;
  IS_OPEN();
  IS_BAD_PTR(value_ptr);
  core::Signal* sig = core::Signal::Convert(hsa_signal);
  IS_VALID(sig);

  // Only busy-wait signals are accepted; every other signal type is rejected.
  if (core::BusyWaitSignal::IsType(sig)) {
    *value_ptr = (volatile hsa_signal_value_t*)&sig->signal_.value;
    return HSA_STATUS_SUCCESS;
  }
  return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  CATCH;
}

uint32_t hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* hsa_signals,
hsa_signal_condition_t* conds, hsa_signal_value_t* values,
uint64_t timeout_hint, hsa_wait_state_t wait_hint,
Expand Down
1 change: 1 addition & 0 deletions src/core/runtime/isa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ constexpr size_t hsa_name_size = 63;
ISAREG_ENTRY_GEN("gfx1030", 10, 3, 0, unsupported, unsupported)
ISAREG_ENTRY_GEN("gfx1031", 10, 3, 1, unsupported, unsupported)
ISAREG_ENTRY_GEN("gfx1032", 10, 3, 2, unsupported, unsupported)
ISAREG_ENTRY_GEN("gfx1033", 10, 3, 3, unsupported, unsupported)
#undef ISAREG_ENTRY_GEN
return supported_isas;
}
Expand Down
30 changes: 12 additions & 18 deletions src/core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,20 +173,14 @@ void Runtime::RegisterAgent(Agent* agent) {
if (cpu_agents_.size() == 1) {
// Might need memory pooling to cover allocation that
// requires less than 4096 bytes.
system_allocator_ =
[&](size_t size, size_t alignment,
MemoryRegion::AllocateFlags alloc_flags) -> void* {
assert(alignment <= 4096);
void* ptr = NULL;
return (HSA_STATUS_SUCCESS ==
core::Runtime::runtime_singleton_->AllocateMemory(
system_regions_fine_[0], size, alloc_flags, &ptr))
? ptr
: NULL;
};

system_deallocator_ =
[](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* {
assert(align <= 4096);
void* ptr = nullptr;
core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr);
return ptr;
};

system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };

BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_);
}
Expand Down Expand Up @@ -451,9 +445,8 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
requires the caller to specify all allowed agents we can't assume that a peer mapped pointer
would remain mapped for the duration of the copy.
*/
void* temp = nullptr;
system_region->Allocate(size, core::MemoryRegion::AllocateNoFlags, &temp);
MAKE_SCOPE_GUARD([&]() { system_region->Free(temp, size); });
void* temp = system_allocator_(size, 0, core::MemoryRegion::AllocateNoFlags);
MAKE_SCOPE_GUARD([&]() { system_deallocator_(temp); });
hsa_status_t err = src_agent->DmaCopy(temp, source, size);
if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
return err;
Expand Down Expand Up @@ -1273,7 +1266,8 @@ Runtime::Runtime()
sys_clock_freq_(0),
vm_fault_event_(nullptr),
vm_fault_signal_(nullptr),
ref_count_(0) {}
ref_count_(0),
kfd_version{0} {}

hsa_status_t Runtime::Load() {
flag_.Refresh();
Expand Down
1 change: 1 addition & 0 deletions src/hsacore.so.def
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ global:
hsa_amd_queue_set_priority;
hsa_amd_register_deallocation_callback;
hsa_amd_deregister_deallocation_callback;
hsa_amd_signal_value_pointer;
_amdgpu_r_debug;

local:
Expand Down
2 changes: 0 additions & 2 deletions src/image/addrlib/src/core/addrlib2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,6 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo(
}
}

ADDR_ASSERT(pOut->surfSize != 0);

ValidBaseAlignments(pOut->baseAlign);

return returnCode;
Expand Down
3 changes: 3 additions & 0 deletions src/image/blit_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ extern uint8_t ocl_blit_object_gfx1012[];
extern uint8_t ocl_blit_object_gfx1030[];
extern uint8_t ocl_blit_object_gfx1031[];
extern uint8_t ocl_blit_object_gfx1032[];
extern uint8_t ocl_blit_object_gfx1033[];

// Arguments inserted by OCL compiler, all zero here.
struct OCLHiddenArgs {
Expand Down Expand Up @@ -1001,6 +1002,8 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name,
*blit_code_object = ocl_blit_object_gfx1031;
} else if (sname == "gfx1032") {
*blit_code_object = ocl_blit_object_gfx1032;
} else if (sname == "gfx1033") {
*blit_code_object = ocl_blit_object_gfx1033;
} else {
return HSA_STATUS_ERROR_INVALID_ISA_NAME;
}
Expand Down
2 changes: 1 addition & 1 deletion src/image/blit_src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ endif()

# Determine the target devices if not specified
if (NOT DEFINED TARGET_DEVICES)
set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032")
set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033")
endif()
set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE )

Expand Down
10 changes: 9 additions & 1 deletion src/image/image_manager_nv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,11 @@ hsa_status_t ImageManagerNv::PopulateImageSrd(Image& image) const {
word4.f.DEPTH =
(image_array) // Doesn't hurt but isn't array_size already >0?
? std::max(image.desc.array_size, static_cast<size_t>(1)) - 1
: (image_3d) ? image.desc.depth - 1 : out.pitch - 1;
: (image_3d) ? image.desc.depth - 1 : 0;
uint32_t minor_ver = MinorVerFromDevID(chip_id_);
// For 1d, 2d and 2d-msaa in gfx1030 and beyond this is pitch-1
if ((minor_ver >= 3) && !image_array && !image_3d)
word4.f.PITCH = out.pitch - 1;

word5.val = 0;
word6.val = 0;
Expand Down Expand Up @@ -630,13 +634,17 @@ uint32_t ImageManagerNv::GetAddrlibSurfaceInfoNv(
const uint32_t num_slice = static_cast<uint32_t>(
std::max(kMinNumSlice, std::max(desc.array_size, desc.depth)));

uint32_t minor_ver = MinorVerFromDevID(chip_id_);
ADDR2_COMPUTE_SURFACE_INFO_INPUT in = {0};
in.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
in.format = addrlib_format;
in.bpp = static_cast<unsigned int>(image_prop.element_size) * 8;
in.width = width;
in.height = height;
in.numSlices = num_slice;
// Custom Pitch is supported in gfx1030 and beyond
if (minor_ver >= 3)
in.pitchInElement = image_data_row_pitch / image_prop.element_size;
switch (desc.geometry) {
case HSA_EXT_IMAGE_GEOMETRY_1D:
case HSA_EXT_IMAGE_GEOMETRY_1DB:
Expand Down
Loading

0 comments on commit ea47a79

Please sign in to comment.