Skip to content

Commit

Permalink
ROCm 1.6 updates.
Browse files Browse the repository at this point in the history
  • Loading branch information
jedwards-AMD committed Jun 29, 2017
1 parent f92eba2 commit 9f1f9f8
Show file tree
Hide file tree
Showing 14 changed files with 97 additions and 69 deletions.
7 changes: 3 additions & 4 deletions sample/vector_copy.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@

#define check(msg, status) \
if (status != HSA_STATUS_SUCCESS) { \
printf("%s failed: %x.\n", #msg, status); \
printf("%s failed.\n", #msg); \
exit(1); \
} else { \
printf("%s succeeded.\n", #msg); \
Expand Down Expand Up @@ -230,11 +230,10 @@ int main(int argc, char **argv) {
*/
hsa_ext_module_t module;
if(HSA_PROFILE_FULL == profile) {
err = load_module_from_file("vector_copy_full.brig",&module);
load_module_from_file("vector_copy_full.brig",&module);
} else {
err = load_module_from_file("vector_copy_base.brig",&module);
load_module_from_file("vector_copy_base.brig",&module);
}
check(Load module from file, err);

/*
* Create hsa program.
Expand Down
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pos

## RPM package specific variables
set ( CPACK_RPM_PACKAGE_DEPENDS "hsakmt-roct-dev" )
set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" )

## Include packaging
Expand Down
2 changes: 1 addition & 1 deletion src/cmake_modules/hsa_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ if(UNIX)
set(PS ":")
set(CMAKE_CXX_FLAGS "-Wall -std=c++11 ${EXTRA_CFLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-undefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--unresolved-symbols=ignore-in-shared-libs")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-strict-aliasing")
if ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" )
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" )
Expand Down
8 changes: 3 additions & 5 deletions src/core/inc/amd_memory_region.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,11 @@ class MemoryRegion : public core::MemoryRegion {
static void DeregisterMemory(void* ptr);

/// @brief Pin memory.
static bool MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes,
void* ptr, size_t size,
uint64_t* alternate_va,
HsaMemMapFlags map_flag);
static bool MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, const void* ptr,
size_t size, uint64_t* alternate_va, HsaMemMapFlags map_flag);

/// @brief Unpin memory.
static void MakeKfdMemoryUnresident(void* ptr);
static void MakeKfdMemoryUnresident(const void* ptr);

MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
const HsaMemoryProperties& mem_props);
Expand Down
5 changes: 2 additions & 3 deletions src/core/inc/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,12 +315,11 @@ class Runtime {
static void AsyncEventsLoop(void*);

struct AllocationRegion {
AllocationRegion() : region(NULL), assigned_agent_(NULL), size(0) {}
AllocationRegion() : region(NULL), size(0) {}
AllocationRegion(const MemoryRegion* region_arg, size_t size_arg)
: region(region_arg), assigned_agent_(NULL), size(size_arg) {}
: region(region_arg), size(size_arg) {}

const MemoryRegion* region;
const Agent* assigned_agent_;
size_t size;
};

Expand Down
2 changes: 1 addition & 1 deletion src/core/runtime/amd_gpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ void GpuAgent::InitDma() {
if (!blit_initialized_.load(std::memory_order_relaxed)) {
// Try create SDMA blit first.
// TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified.
if ((isa_->GetMajorVersion() != 9) &&
if ((isa_->GetMajorVersion() != 8) &&
core::Runtime::runtime_singleton_->flag().enable_sdma() &&
(HSA_PROFILE_BASE == profile_)) {
blits_[BlitHostToDev] = CreateBlitSdma();
Expand Down
24 changes: 11 additions & 13 deletions src/core/runtime/amd_memory_region.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,23 +81,21 @@ bool MemoryRegion::RegisterMemory(void* ptr, size_t size, size_t num_nodes,

/// @brief Deregister @p ptr by forwarding it to hsaKmtDeregisterMemory.
/// Nothing is reported back to the caller.
void MemoryRegion::DeregisterMemory(void* ptr) {
  hsaKmtDeregisterMemory(ptr);
}

bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes,
void* ptr, size_t size,
uint64_t* alternate_va,
bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, const void* ptr,
size_t size, uint64_t* alternate_va,
HsaMemMapFlags map_flag) {
assert(num_node > 0);
assert(nodes != NULL);

*alternate_va = 0;
const HSAKMT_STATUS status =
hsaKmtMapMemoryToGPUNodes(ptr, size, alternate_va, map_flag, num_node,
const_cast<uint32_t*>(nodes));
const HSAKMT_STATUS status = hsaKmtMapMemoryToGPUNodes(
const_cast<void*>(ptr), size, alternate_va, map_flag, num_node, const_cast<uint32_t*>(nodes));

return (status == HSAKMT_STATUS_SUCCESS);
}

void MemoryRegion::MakeKfdMemoryUnresident(void* ptr) {
hsaKmtUnmapMemoryToGPU(ptr);
void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
}

MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,
Expand Down Expand Up @@ -454,28 +452,28 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
if (whitelist_nodes.size() == 0 && IsSystem()) {
assert(cpu_in_list);
// This is a system region and only CPU agents in the whitelist.
// No need to call map.
// Remove old mappings.
amd::MemoryRegion::MakeKfdMemoryUnresident(ptr);
return HSA_STATUS_SUCCESS;
}

// If this is a local memory region, the owning gpu always needs to be in
// the whitelist.
if (IsPublic() &&
std::find(whitelist_nodes.begin(), whitelist_nodes.end(),
owner()->node_id()) == whitelist_nodes.end()) {
std::find(whitelist_nodes.begin(), whitelist_nodes.end(), owner()->node_id()) ==
whitelist_nodes.end()) {
whitelist_nodes.push_back(owner()->node_id());
whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(owner()));
}

HsaMemMapFlags map_flag = map_flag_;
map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0;


{
ScopedAcquire<KernelMutex> lock(&core::Runtime::runtime_singleton_->memory_lock_);
uint64_t alternate_va = 0;
if (!amd::MemoryRegion::MakeKfdMemoryResident(
whitelist_nodes.size(), &whitelist_nodes[0], const_cast<void*>(ptr),
whitelist_nodes.size(), &whitelist_nodes[0], ptr,
size, &alternate_va, map_flag)) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
Expand Down
4 changes: 4 additions & 0 deletions src/core/runtime/isa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,14 @@ const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() {

ISAREG_ENTRY_GEN(7, 0, 0)
ISAREG_ENTRY_GEN(7, 0, 1)
ISAREG_ENTRY_GEN(7, 0, 2)
ISAREG_ENTRY_GEN(8, 0, 1)
ISAREG_ENTRY_GEN(8, 0, 2)
ISAREG_ENTRY_GEN(8, 0, 3)
ISAREG_ENTRY_GEN(9, 0, 0)
ISAREG_ENTRY_GEN(9, 0, 1)
ISAREG_ENTRY_GEN(9, 0, 2)
ISAREG_ENTRY_GEN(9, 0, 3)

return supported_isas;
}
Expand Down
55 changes: 47 additions & 8 deletions src/core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,8 +453,42 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
}

/// @brief Set @p count 32-bit elements starting at @p ptr to @p value.
///
/// The allocation containing @p ptr is looked up with PtrInfo and the fill is
/// routed according to which address range of that allocation covers
/// [ptr, ptr + count * sizeof(uint32_t)):
///  - agent address range: the fill is handed to DmaFill on the owning agent
///    if it is a GPU, otherwise on the first GPU found in the list of agents
///    returned by PtrInfo;
///  - host address range (also used when no GPU was found above): the fill is
///    performed directly by the calling thread.
///
/// @param ptr    First element to write.
/// @param value  32-bit pattern stored into every element.
/// @param count  Number of uint32_t elements to write.
///
/// @return The PtrInfo error if the lookup fails, the DmaFill status for a GPU
///         fill, HSA_STATUS_SUCCESS for a host fill, or
///         HSA_STATUS_ERROR_INVALID_ALLOCATION when the range is covered by
///         neither address range.
hsa_status_t Runtime::FillMemory(void* ptr, uint32_t value, size_t count) {
  // Choose blit agent from pointer info.
  hsa_amd_pointer_info_t info;
  uint32_t agent_count = 0;
  hsa_agent_t* accessible = nullptr;
  info.size = sizeof(info);
  // The agent list is allocated with malloc by PtrInfo; release it on every exit path.
  MAKE_SCOPE_GUARD([&]() { free(accessible); });
  hsa_status_t err = PtrInfo(ptr, &info, malloc, &agent_count, &accessible);
  if (err != HSA_STATUS_SUCCESS) return err;

  ptrdiff_t endPtr = (ptrdiff_t)ptr + count * sizeof(uint32_t);

  // Check for GPU fill.
  // Selects GPU fill for SVM and Locked allocations if a GPU address is given and is mapped.
  if (info.agentBaseAddress <= ptr &&
      endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
    core::Agent* blit_agent = core::Agent::Convert(info.agentOwner);
    if (blit_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
      // Owner is not a GPU: fall back to the first GPU that can access the allocation.
      blit_agent = nullptr;
      for (uint32_t i = 0; i < agent_count; i++) {
        if (core::Agent::Convert(accessible[i])->device_type() ==
            core::Agent::DeviceType::kAmdGpuDevice) {
          blit_agent = core::Agent::Convert(accessible[i]);
          break;
        }
      }
    }
    if (blit_agent) return blit_agent->DmaFill(ptr, value, count);
  }

  // Host and unmapped SVM addresses are filled by the host.
  if (info.hostBaseAddress <= ptr && endPtr <= (ptrdiff_t)info.hostBaseAddress + info.sizeInBytes) {
    // Store whole 32-bit elements; a byte-wise memset would replicate only the
    // low byte of value. NOTE(review): assumes ptr is 4-byte aligned, which the
    // public fill API appears to require of its callers -- confirm.
    uint32_t* dst = static_cast<uint32_t*>(ptr);
    for (size_t i = 0; i < count; i++) dst[i] = value;
    return HSA_STATUS_SUCCESS;
  }

  return HSA_STATUS_ERROR_INVALID_ALLOCATION;
}

hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
Expand Down Expand Up @@ -646,6 +680,8 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
HsaPointerInfo thunkInfo;
uint32_t* mappedNodes;

hsa_amd_pointer_info_t retInfo;

// check output struct is at least as large as the first info revision.
if (info->size < sizeof(struct hsa_amd_pointer_info_v1_s)) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

Expand Down Expand Up @@ -674,12 +710,15 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS,
"Thunk pointer info mismatch");

info->size = Min(info->size, sizeof(struct hsa_amd_pointer_info_v1_s));
info->type = (hsa_amd_pointer_type_t)thunkInfo.Type;
info->agentBaseAddress = (void*)thunkInfo.GPUAddress;
info->hostBaseAddress = thunkInfo.CPUAddress;
info->sizeInBytes = thunkInfo.SizeInBytes;
info->userData = thunkInfo.UserData;
retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));
retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
retInfo.hostBaseAddress = thunkInfo.CPUAddress;
retInfo.sizeInBytes = thunkInfo.SizeInBytes;
retInfo.userData = thunkInfo.UserData;
retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();

memcpy(info, &retInfo, retInfo.size);

if (returnListData) {
uint32_t count = 0;
Expand Down
19 changes: 0 additions & 19 deletions src/core/util/flag.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,9 @@ class Flag {
var = os::GetEnvVar("HSA_ENABLE_INTERRUPT");
enable_interrupt_ = (var == "0") ? false : true;

var = os::GetEnvVar("HSA_ENABLE_THREAD_TRACE");
enable_thread_trace_ = (var == "1") ? true : false;

var = os::GetEnvVar("HSA_THREAD_TRACE_MEM_SIZE");
thread_trace_buff_size_ = atoi(var.c_str());

var = os::GetEnvVar("HSA_ENABLE_SDMA");
enable_sdma_ = (var == "0") ? false : true;

var = os::GetEnvVar("HSA_EMULATE_AQL");
emulate_aql_ = (var == "1") ? true : false;

var = os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND");
running_valgrind_ = (var == "1") ? true : false;

Expand All @@ -104,14 +95,8 @@ class Flag {

bool enable_interrupt() const { return enable_interrupt_; }

bool enable_thread_trace() const { return enable_thread_trace_; }

bool thread_trace_buff_size() const { return thread_trace_buff_size_; }

bool enable_sdma() const { return enable_sdma_; }

bool emulate_aql() const { return emulate_aql_; }

bool running_valgrind() const { return running_valgrind_; }

bool sdma_wait_idle() const { return sdma_wait_idle_; }
Expand All @@ -127,14 +112,10 @@ class Flag {
bool enable_vm_fault_message_;
bool enable_interrupt_;
bool enable_sdma_;
bool emulate_aql_;
bool running_valgrind_;
bool sdma_wait_idle_;
bool enable_queue_fault_message_;

bool enable_thread_trace_;
size_t thread_trace_buff_size_;

uint32_t max_queues_;

size_t scratch_mem_size_;
Expand Down
3 changes: 1 addition & 2 deletions src/core/util/timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,7 @@ class fast_clock {
#ifdef __x86_64__
static __forceinline raw_rep raw_now() { return __rdtsc(); }
static __forceinline raw_frequency raw_freq() { return freq; }
#endif
#ifdef __aarch64__
#else
static __forceinline raw_rep raw_now() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
Expand Down
5 changes: 0 additions & 5 deletions src/core/util/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,6 @@ typedef uint64_t uint64;
#if defined(__GNUC__)
#if defined(__i386__) || defined(__x86_64__)
#include <x86intrin.h>
#elif defined(__aarch64__)
#else
#error \
"Processor or compiler not identified. " \
"Need to provide a lightweight approximate clock interface via function uint64_t __rdtsc() or adapt timer.h to your platform."
#endif

#define __forceinline __inline__ __attribute__((always_inline))
Expand Down
26 changes: 23 additions & 3 deletions src/inc/hsa_ext_amd.h
Original file line number Diff line number Diff line change
Expand Up @@ -1164,14 +1164,17 @@ hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
*
* @param[in] count Number of uint32_t elements to be set to the value.
*
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
* @retval HSA_STATUS_SUCCESS The function has been executed successfully.
*
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
* @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
* initialized.
*
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
* not 4-byte aligned
*
* @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory
* region was not allocated with HSA runtime APIs.
*
*/
hsa_status_t HSA_API
hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
Expand Down Expand Up @@ -1346,6 +1349,23 @@ typedef struct hsa_amd_pointer_info_v1_s {
Application provided value.
*/
void* userData;
} hsa_amd_pointer_info_v1_t;

/**
* @brief Minor version updates to pointer info.
*
* Extends the v1 pointer info with the agentOwner field and is exposed under
* the name hsa_amd_pointer_info_t. When compiled as C++ the struct derives
* from hsa_amd_pointer_info_v1_t; when compiled as C a separate declaration
* is used.
*
* NOTE(review): the C branch uses a different struct tag
* (hsa_amd_pointer_info_v2_t instead of hsa_amd_pointer_info_v2_s), and its
* first line, "struct hsa_amd_pointer_info_v1_t;", refers to a struct tag
* while hsa_amd_pointer_info_v1_t is a typedef name. Confirm that the v1
* fields are really embedded for C callers and that the C layout matches the
* C++ one.
*/
#ifdef __cplusplus
typedef struct hsa_amd_pointer_info_v2_s : hsa_amd_pointer_info_v1_t {
#else
typedef struct hsa_amd_pointer_info_v2_t {
struct hsa_amd_pointer_info_v1_t;
#endif
/*
Reports an agent which "owns" (i.e. has preferred access to) the pool in which the allocation
was made. When multiple agents share equal access to a pool (e.g. multiple CPU agents, or
multi-die GPU boards) any such agent may be returned.
*/
hsa_agent_t agentOwner;
} hsa_amd_pointer_info_t;

/**
Expand Down
4 changes: 0 additions & 4 deletions src/libamdhsacode/amd_hsa_code.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1505,15 +1505,11 @@ namespace code {

bool AmdHsaCode::PullElfV2()
{
Segment* note = NULL;
for (size_t i = 0; i < img->segmentCount(); ++i) {
Segment* s = img->segment(i);
if (s->type() == PT_LOAD) {
dataSegments.push_back(s);
}
else if (s->type() == PT_NOTE && s->align() >= 4) {
note = s;
}
}
for (size_t i = 0; i < img->sectionCount(); ++i) {
Section* sec = img->section(i);
Expand Down

0 comments on commit 9f1f9f8

Please sign in to comment.