Skip to content

Commit

Permalink
ROCm 5.0.0 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
skeelyamd committed Feb 10, 2022
1 parent b3df226 commit 635157e
Show file tree
Hide file tree
Showing 24 changed files with 413 additions and 85 deletions.
9 changes: 5 additions & 4 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ if (ROCM_CCACHE_BUILD)
endif() # if (ROCM_CCACHE_BUILD)

## Get version strings
get_version ( "1.4.0" )
get_version ( "1.5.0" )
# When ROCM_PATCH_VERSION expands to a value that if() treats as true, use it
# as VERSION_PATCH. The variable is expanded unquoted, so an unset, empty or
# "0" value leaves VERSION_PATCH untouched.
# NOTE(review): VERSION_PATCH is presumably first populated by get_version() - confirm.
if ( ${ROCM_PATCH_VERSION} )
set ( VERSION_PATCH ${ROCM_PATCH_VERSION})
endif()
Expand Down Expand Up @@ -126,7 +126,7 @@ target_include_directories( ${CORE_RUNTIME_TARGET}
set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib;$ORIGIN/../../lib64;$ORIGIN/../lib64" )

## ------------------------- Linux Compiler and Linker options -------------------------
set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -Werror -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function )
set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function )

## Extra image settings - audit!
set ( HSA_CXX_FLAGS ${HSA_CXX_FLAGS} -Wno-deprecated-declarations )
Expand Down Expand Up @@ -301,7 +301,7 @@ install ( TARGETS ${CORE_RUNTIME_TARGET} EXPORT ${CORE_RUNTIME_NAME}Targets
LIBRARY DESTINATION lib COMPONENT binary )

# Install license
#install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary )
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary )

# Install public headers
# TODO: Fix me for flat directory layout. Should be ${CMAKE_INSTALL_INCLUDEDIR}
Expand Down Expand Up @@ -420,7 +420,7 @@ set ( CPACK_DEBIAN_DEV_PACKAGE_CONTROL_EXTRA "DEBIAN/Dev/postinst;DEBIAN/Dev/pre
set ( CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "hsakmt-roct" )
set ( CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "hsa-rocr" )
if ( ROCM_DEP_ROCMCORE )
string ( APPEND PACK_DEBIAN_BINARY_PACKAGE_DEPENDS ", rocm-core" )
string ( APPEND CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS ", rocm-core" )
string ( APPEND CPACK_DEBIAN_DEV_PACKAGE_DEPENDS ", rocm-core" )
endif()
set ( CPACK_DEBIAN_PACKAGE_BREAKS "hsa-ext-rocr-dev" )
Expand All @@ -438,6 +438,7 @@ endif()
string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" )
message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
set( CPACK_RPM_PACKAGE_LICENSE "NCSA" )

## Process the Rpm install/remove scripts to update the CPACK variables
configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/Binary/post.in" RPM/Binary/post @ONLY )
Expand Down
15 changes: 15 additions & 0 deletions src/core/inc/agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,15 @@
#include "core/inc/queue.h"
#include "core/inc/memory_region.h"
#include "core/util/utils.h"
#include "core/util/locks.h"

namespace rocr {

// Forward declare AMD::MemoryRegion
namespace AMD {
class MemoryRegion;
}

namespace core {
class Signal;

Expand All @@ -65,6 +72,8 @@ typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source,
// replaced by tools libraries. All functions other than Convert, node_id,
// device_type, and public_handle must be virtual.
class Agent : public Checked<0xF6BC25EB17E6F917> {
friend class rocr::AMD::MemoryRegion;

public:
// @brief Convert agent object into hsa_agent_t.
//
Expand Down Expand Up @@ -297,6 +306,12 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {

bool profiling_enabled_;

// Used by an Agent's MemoryRegions to ensure serial memory operation on the device.
// Serial memory operations are needed to ensure, among other things, that allocation failures are
// due to true OOM conditions and per region caching (Trim and Allocate must be serial and
// exclusive to ensure this).
KernelMutex agent_memory_lock_;

// Forbid copying and moving of this object
DISALLOW_COPY_AND_ASSIGN(Agent);
};
Expand Down
10 changes: 9 additions & 1 deletion src/core/inc/amd_memory_region.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ class MemoryRegion : public core::MemoryRegion {

HSAuint64 virtual_size_;

// Protects against concurrent allow_access calls to fragments of the same block by virtue of all
// fragments of the block routing to the same MemoryRegion.
mutable KernelMutex access_lock_;

static const size_t kPageSize_ = 4096;
Expand All @@ -193,14 +195,20 @@ class MemoryRegion : public core::MemoryRegion {
hsa_amd_memory_pool_access_t GetAccessInfo(const core::Agent& agent,
const core::Runtime::LinkInfo& link_info) const;

// Operational body for Allocate. Recursive.
hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const;

// Operational body for Free. Recursive.
hsa_status_t FreeImpl(void* address, size_t size) const;

class BlockAllocator {
private:
MemoryRegion& region_;
static const size_t block_size_ = 2 * 1024 * 1024; // 2MB blocks.
public:
explicit BlockAllocator(MemoryRegion& region) : region_(region) {}
void* alloc(size_t request_size, size_t& allocated_size) const;
void free(void* ptr, size_t length) const { region_.Free(ptr, length); }
void free(void* ptr, size_t length) const { region_.FreeImpl(ptr, length); }
size_t block_size() const { return block_size_; }
};

Expand Down
2 changes: 1 addition & 1 deletion src/core/inc/queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ struct AqlPacket {
}

bool IsValid() const {
return (type() <= HSA_PACKET_TYPE_BARRIER_OR) & (type() != HSA_PACKET_TYPE_INVALID);
return int(type() <= HSA_PACKET_TYPE_BARRIER_OR) & (type() != HSA_PACKET_TYPE_INVALID);
}

std::string string() const {
Expand Down
8 changes: 6 additions & 2 deletions src/core/inc/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -487,10 +487,14 @@ class Runtime {
/// @brief Get most recently issued SVM prefetch agent for the range in question.
Agent* GetSVMPrefetchAgent(void* ptr, size_t size);

// Mutex object to protect multithreaded access to ::allocation_map_,
/// @brief Get the highest used node id.
///
/// Returns the key of the last entry of agents_by_node_ (reverse-begin).
/// NOTE(review): the reverse iterator is dereferenced without an emptiness
/// check, so this assumes agents_by_node_ holds at least one entry - confirm
/// against callers.
uint32_t max_node_id() const {
  auto last_entry = agents_by_node_.rbegin();
  return last_entry->first;
}

// Mutex object to protect multithreaded access to ::allocation_map_.
// Also ensures atomicity of pointer info queries by interlocking
// KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
// registered & mapped arrays.
KernelMutex memory_lock_;
KernelSharedMutex memory_lock_;

// Array containing tools library handles.
std::vector<os::LibHandle> tool_libs_;
Expand Down
2 changes: 1 addition & 1 deletion src/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
MAKE_NAMED_SCOPE_GUARD(PM4IBGuard, [&]() { agent_->system_deallocator()(pm4_ib_buf_); });

// Set initial CU mask
SetCUMasking(0, nullptr);
if (!core::Runtime::runtime_singleton_->flag().cu_mask_skip_init()) SetCUMasking(0, nullptr);

active_ = true;

Expand Down
42 changes: 33 additions & 9 deletions src/core/runtime/amd_gpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -599,14 +599,21 @@ void GpuAgent::InitDma() {
queues_[QueueUtility].reset(queue_lambda);

// Decide which engine to use for blits.
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue) {
auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev) {
Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();

// User SDMA queues are unstable on gfx8.
bool use_sdma = ((isa_->GetMajorVersion() != 8));
// User SDMA queues are unstable on gfx8 and unsupported on gfx1013.
bool use_sdma =
((isa_->GetMajorVersion() != 8) && (isa_->GetVersion() != std::make_tuple(10, 1, 3)));
if (sdma_override != Flag::SDMA_DEFAULT) use_sdma = (sdma_override == Flag::SDMA_ENABLE);

if (use_sdma && (HSA_PROFILE_BASE == profile_)) {
// On gfx90a ensure that HostToDevice queue is created first and so is placed on SDMA0.
if ((!use_xgmi) && (!isHostToDev) && (isa_->GetMajorVersion() == 9) &&
(isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) {
*blits_[BlitHostToDev];
}

auto ret = CreateBlitSdma(use_xgmi);
if (ret != nullptr) return ret;
}
Expand Down Expand Up @@ -642,13 +649,14 @@ void GpuAgent::InitDma() {
return ret;
});
blits_[BlitHostToDev].reset(
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly]); });
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true); });
blits_[BlitDevToHost].reset(
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
[blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false); });

// XGMI engines.
for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
blits_[idx].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility]); });
blits_[idx].reset(
[blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility], false); });
}

// GWS queues.
Expand Down Expand Up @@ -794,7 +802,11 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR;
break;
case HSA_AGENT_INFO_FAST_F16_OPERATION:
*((bool*)value) = false;
if (isa_->GetMajorVersion() >= 8) {
*((bool*)value) = true;
} else {
*((bool*)value) = false;
}
break;
case HSA_AGENT_INFO_PROFILE:
*((hsa_profile_t*)value) = profile_;
Expand Down Expand Up @@ -998,6 +1010,17 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
case HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS:
assert(regions_.size() != 0 && "No device local memory found!");
*((bool*)value) = properties_.Capability.ui32.CoherentHostAccess == 1;
case HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT:
if (core::Runtime::runtime_singleton_->flag().coop_cu_count() &&
(isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) &&
(isa_->GetStepping() == 10)) {
uint32_t count = 0;
hsa_status_t err = GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &count);
assert(err == HSA_STATUS_SUCCESS && "CU count query failed.");
*((uint32_t*)value) = (count & 0xFFFFFFF8) - 8; // value = floor(count/8)*8-8
break;
}
return GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, value);
default:
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
break;
Expand Down Expand Up @@ -1504,10 +1527,11 @@ lazy_ptr<core::Blit>& GpuAgent::GetPcieBlit(const core::Agent& dst_agent,
lazy_ptr<core::Blit>& blit =
(src_agent.device_type() == core::Agent::kAmdCpuDevice &&
dst_agent.device_type() == core::Agent::kAmdGpuDevice)
? blits_[BlitHostToDev]
? blits_[BlitHostToDev] // CPU->GPU transfer.
: (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
dst_agent.device_type() == core::Agent::kAmdCpuDevice)
? blits_[BlitDevToHost] : blits_[BlitDevToHost];
? blits_[BlitDevToHost] // GPU->CPU transfer.
: blits_[BlitDevToHost]; // GPU->GPU transfer.
return blit;
}

Expand Down
22 changes: 17 additions & 5 deletions src/core/runtime/amd_memory_region.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,12 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, cor
MemoryRegion::~MemoryRegion() {}

// Public allocation entry point. Takes the owning agent's agent_memory_lock_
// and then forwards every argument unchanged to AllocateImpl, returning its
// status.
hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const {
  // Serialize memory operations on the owning agent while the allocation runs.
  ScopedAcquire<KernelMutex> agent_guard(&owner()->agent_memory_lock_);
  const hsa_status_t status = AllocateImpl(size, alloc_flags, address);
  return status;
}

hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
void** address) const {
if (address == NULL) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
Expand Down Expand Up @@ -251,6 +257,11 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi
}

// Public release entry point. Takes the owning agent's agent_memory_lock_ and
// then forwards to FreeImpl, returning its status.
hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
  // Serialize memory operations on the owning agent while the release runs.
  ScopedAcquire<KernelMutex> agent_guard(&owner()->agent_memory_lock_);
  const hsa_status_t status = FreeImpl(address, size);
  return status;
}

hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {
if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS;

MakeKfdMemoryUnresident(address);
Expand All @@ -262,6 +273,7 @@ hsa_status_t MemoryRegion::Free(void* address, size_t size) const {

// TODO: Look into a better name and/or making this process transparent to exporting.
//
// Under the owning agent's agent_memory_lock_, calls
// fragment_allocator_.discardBlock(address). Returns HSA_STATUS_SUCCESS when
// that call reports success and HSA_STATUS_ERROR_INVALID_ALLOCATION otherwise.
hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const {
  ScopedAcquire<KernelMutex> agent_guard(&owner()->agent_memory_lock_);
  if (fragment_allocator_.discardBlock(address)) {
    return HSA_STATUS_SUCCESS;
  }
  return HSA_STATUS_ERROR_INVALID_ALLOCATION;
}
Expand Down Expand Up @@ -583,8 +595,10 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
HsaMemMapFlags map_flag = map_flag_;
map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0;

{
ScopedAcquire<KernelMutex> lock(&core::Runtime::runtime_singleton_->memory_lock_);
{ // Sequence with pointer info since queries to other fragments of the block may be adjusted by
// this call.
ScopedAcquire<KernelSharedMutex::Shared> lock(
core::Runtime::runtime_singleton_->memory_lock_.shared());
uint64_t alternate_va = 0;
if (!AMD::MemoryRegion::MakeKfdMemoryResident(
whitelist_nodes.size(), &whitelist_nodes[0], ptr,
Expand All @@ -593,8 +607,6 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
}
}

lock.Release();

return HSA_STATUS_SUCCESS;
}

Expand Down Expand Up @@ -700,7 +712,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated
void* ret;
size_t bsize = AlignUp(request_size, block_size());

hsa_status_t err = region_.Allocate(
hsa_status_t err = region_.AllocateImpl(
bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret);
if (err != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
Expand Down
22 changes: 13 additions & 9 deletions src/core/runtime/hsa_ext_amd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -567,14 +567,14 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
void** agent_ptr) {
TRY;
IS_OPEN();
*agent_ptr = NULL;

if (size == 0 || host_ptr == NULL || agent_ptr == NULL) {
if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}

if ((agents != NULL && num_agent == 0) ||
(agents == NULL && num_agent != 0)) {
*agent_ptr = nullptr;

if ((agents != nullptr && num_agent == 0) || (agents == nullptr && num_agent != 0)) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}

Expand All @@ -598,13 +598,14 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_
void** agent_ptr) {
TRY;
IS_OPEN();
*agent_ptr = NULL;

if (size == 0 || host_ptr == NULL || agent_ptr == NULL || flags != 0) {
if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr || flags != 0) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}

if ((agents != NULL && num_agent == 0) || (agents == NULL && num_agent != 0)) {
*agent_ptr = nullptr;

if ((agents != nullptr && num_agent == 0) || (agents == nullptr && num_agent != 0)) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}

Expand Down Expand Up @@ -806,9 +807,13 @@ hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents,
core::Agent** core_agents = short_agents;
if (num_agents > tinyArraySize) {
core_agents = new core::Agent* [num_agents];
if (core_agents == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
if (core_agents == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

MAKE_SCOPE_GUARD([&]() {
if (num_agents > tinyArraySize) delete[] core_agents;
});

for (uint32_t i = 0; i < num_agents; i++) {
core::Agent* device = core::Agent::Convert(agents[i]);
IS_VALID(device);
Expand All @@ -819,7 +824,6 @@ hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents,
num_agents, core_agents, interop_handle, flags, size, ptr, metadata_size,
metadata);

if (num_agents > tinyArraySize) delete[] core_agents;
return ret;
CATCH;
}
Expand Down
3 changes: 3 additions & 0 deletions src/core/runtime/isa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,9 @@ constexpr size_t hsa_name_size = 63;
ISAREG_ENTRY_GEN("gfx1012", 10, 1, 2, unsupported, any)
ISAREG_ENTRY_GEN("gfx1012:xnack-", 10, 1, 2, unsupported, disabled)
ISAREG_ENTRY_GEN("gfx1012:xnack+", 10, 1, 2, unsupported, enabled)
ISAREG_ENTRY_GEN("gfx1013", 10, 1, 3, unsupported, any)
ISAREG_ENTRY_GEN("gfx1013:xnack-", 10, 1, 3, unsupported, disabled)
ISAREG_ENTRY_GEN("gfx1013:xnack+", 10, 1, 3, unsupported, enabled)
ISAREG_ENTRY_GEN("gfx1030", 10, 3, 0, unsupported, unsupported)
ISAREG_ENTRY_GEN("gfx1031", 10, 3, 1, unsupported, unsupported)
ISAREG_ENTRY_GEN("gfx1032", 10, 3, 2, unsupported, unsupported)
Expand Down
Loading

0 comments on commit 635157e

Please sign in to comment.