diff --git a/sample/vector_copy.c b/sample/vector_copy.c
index 44e1795f0..d1de531bb 100644
--- a/sample/vector_copy.c
+++ b/sample/vector_copy.c
@@ -49,7 +49,7 @@
 #define check(msg, status) \
 if (status != HSA_STATUS_SUCCESS) { \
-    printf("%s failed: %x.\n", #msg, status); \
+    printf("%s failed.\n", #msg); \
     exit(1); \
 } else { \
    printf("%s succeeded.\n", #msg); \
 }
@@ -230,11 +230,10 @@ int main(int argc, char **argv) {
     */
     hsa_ext_module_t module;
     if(HSA_PROFILE_FULL == profile) {
-        err = load_module_from_file("vector_copy_full.brig",&module);
+        load_module_from_file("vector_copy_full.brig",&module);
     } else {
-        err = load_module_from_file("vector_copy_base.brig",&module);
+        load_module_from_file("vector_copy_base.brig",&module);
     }
-    check(Load module from file, err);
 
     /*
      * Create hsa program.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7c9ef0c84..d9a98c972 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -192,7 +192,7 @@ set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pos
 
 ## RPM package specific variables
 set ( CPACK_RPM_PACKAGE_DEPENDS "hsakmt-roct-dev" )
-set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
+set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
 set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" )
 
 ## Include packaging
diff --git a/src/cmake_modules/hsa_common.cmake b/src/cmake_modules/hsa_common.cmake
index 26971c306..0b907bf3d 100644
--- a/src/cmake_modules/hsa_common.cmake
+++ b/src/cmake_modules/hsa_common.cmake
@@ -55,7 +55,7 @@ if(UNIX)
     set(PS ":")
     set(CMAKE_CXX_FLAGS "-Wall -std=c++11 ${EXTRA_CFLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-undefined")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--unresolved-symbols=ignore-in-shared-libs")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-strict-aliasing")
     if ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" )
         set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" )
diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h
index d85d9443a..d2321dfd4 100644
--- a/src/core/inc/amd_memory_region.h
+++ b/src/core/inc/amd_memory_region.h
@@ -86,13 +86,11 @@ class MemoryRegion : public core::MemoryRegion {
   static void DeregisterMemory(void* ptr);
 
   /// @brief Pin memory.
-  static bool MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes,
-                                    void* ptr, size_t size,
-                                    uint64_t* alternate_va,
-                                    HsaMemMapFlags map_flag);
+  static bool MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, const void* ptr,
+                                    size_t size, uint64_t* alternate_va, HsaMemMapFlags map_flag);
 
   /// @brief Unpin memory.
-  static void MakeKfdMemoryUnresident(void* ptr);
+  static void MakeKfdMemoryUnresident(const void* ptr);
 
   MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
                const HsaMemoryProperties& mem_props);
diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h
index c8d00bcb6..06eff531a 100644
--- a/src/core/inc/runtime.h
+++ b/src/core/inc/runtime.h
@@ -315,12 +315,11 @@ class Runtime {
   static void AsyncEventsLoop(void*);
 
   struct AllocationRegion {
-    AllocationRegion() : region(NULL), assigned_agent_(NULL), size(0) {}
+    AllocationRegion() : region(NULL), size(0) {}
     AllocationRegion(const MemoryRegion* region_arg, size_t size_arg)
-        : region(region_arg), assigned_agent_(NULL), size(size_arg) {}
+        : region(region_arg), size(size_arg) {}
 
     const MemoryRegion* region;
-    const Agent* assigned_agent_;
     size_t size;
   };
 
diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
index 08ea936a5..1fa48a94d 100644
--- a/src/core/runtime/amd_gpu_agent.cpp
+++ b/src/core/runtime/amd_gpu_agent.cpp
@@ -566,7 +566,7 @@ void GpuAgent::InitDma() {
   if (!blit_initialized_.load(std::memory_order_relaxed)) {
     // Try create SDMA blit first.
     // TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified.
-    if ((isa_->GetMajorVersion() != 9) &&
+    if ((isa_->GetMajorVersion() != 8) &&
         core::Runtime::runtime_singleton_->flag().enable_sdma() &&
         (HSA_PROFILE_BASE == profile_)) {
       blits_[BlitHostToDev] = CreateBlitSdma();
diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp
index 295be378d..00642f3de 100644
--- a/src/core/runtime/amd_memory_region.cpp
+++ b/src/core/runtime/amd_memory_region.cpp
@@ -81,23 +81,21 @@ bool MemoryRegion::RegisterMemory(void* ptr, size_t size, size_t num_nodes,
 
 void MemoryRegion::DeregisterMemory(void* ptr) { hsaKmtDeregisterMemory(ptr); }
 
-bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes,
-                                         void* ptr, size_t size,
-                                         uint64_t* alternate_va,
+bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, const void* ptr,
+                                         size_t size, uint64_t* alternate_va,
                                          HsaMemMapFlags map_flag) {
   assert(num_node > 0);
   assert(nodes != NULL);
 
   *alternate_va = 0;
-  const HSAKMT_STATUS status =
-      hsaKmtMapMemoryToGPUNodes(ptr, size, alternate_va, map_flag, num_node,
-                                const_cast<uint32_t*>(nodes));
+  const HSAKMT_STATUS status = hsaKmtMapMemoryToGPUNodes(
+      const_cast<void*>(ptr), size, alternate_va, map_flag, num_node, const_cast<uint32_t*>(nodes));
 
   return (status == HSAKMT_STATUS_SUCCESS);
 }
 
-void MemoryRegion::MakeKfdMemoryUnresident(void* ptr) {
-  hsaKmtUnmapMemoryToGPU(ptr);
+void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
+  hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
 }
 
 MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,
@@ -454,15 +452,16 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
   if (whitelist_nodes.size() == 0 && IsSystem()) {
     assert(cpu_in_list);
     // This is a system region and only CPU agents in the whitelist.
-    // No need to call map.
+    // Remove old mappings.
+    amd::MemoryRegion::MakeKfdMemoryUnresident(ptr);
     return HSA_STATUS_SUCCESS;
   }
 
   // If this is a local memory region, the owning gpu always needs to be in
   // the whitelist.
   if (IsPublic() &&
-      std::find(whitelist_nodes.begin(), whitelist_nodes.end(),
-                owner()->node_id()) == whitelist_nodes.end()) {
+      std::find(whitelist_nodes.begin(), whitelist_nodes.end(), owner()->node_id()) ==
+          whitelist_nodes.end()) {
     whitelist_nodes.push_back(owner()->node_id());
     whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(owner()));
   }
@@ -470,12 +469,11 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
 
   HsaMemMapFlags map_flag = map_flag_;
   map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0;
 
-  {
   ScopedAcquire<KernelMutex> lock(&core::Runtime::runtime_singleton_->memory_lock_);
   uint64_t alternate_va = 0;
   if (!amd::MemoryRegion::MakeKfdMemoryResident(
-          whitelist_nodes.size(), &whitelist_nodes[0], const_cast<void*>(ptr),
+          whitelist_nodes.size(), &whitelist_nodes[0], ptr,
           size, &alternate_va, map_flag)) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp
index 363d6c20d..a4a15fa03 100644
--- a/src/core/runtime/isa.cpp
+++ b/src/core/runtime/isa.cpp
@@ -196,10 +196,14 @@ const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() {
 
   ISAREG_ENTRY_GEN(7, 0, 0)
   ISAREG_ENTRY_GEN(7, 0, 1)
+  ISAREG_ENTRY_GEN(7, 0, 2)
   ISAREG_ENTRY_GEN(8, 0, 1)
   ISAREG_ENTRY_GEN(8, 0, 2)
   ISAREG_ENTRY_GEN(8, 0, 3)
   ISAREG_ENTRY_GEN(9, 0, 0)
+  ISAREG_ENTRY_GEN(9, 0, 1)
+  ISAREG_ENTRY_GEN(9, 0, 2)
+  ISAREG_ENTRY_GEN(9, 0, 3)
 
   return supported_isas;
 }
diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp
index f2cf2d55c..442fe82f5 100644
--- a/src/core/runtime/runtime.cpp
+++ b/src/core/runtime/runtime.cpp
@@ -453,8 +453,42 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
 }
 
 hsa_status_t Runtime::FillMemory(void* ptr, uint32_t value, size_t count) {
-  assert(blit_agent_ != NULL);
-  return blit_agent_->DmaFill(ptr, value, count);
+  // Choose blit agent from pointer info
+  hsa_amd_pointer_info_t info;
+  uint32_t agent_count;
+  hsa_agent_t* accessible = nullptr;
+  info.size = sizeof(info);
+  MAKE_SCOPE_GUARD([&]() { free(accessible); });
+  hsa_status_t err = PtrInfo(ptr, &info, malloc, &agent_count, &accessible);
+  if (err != HSA_STATUS_SUCCESS) return err;
+
+  ptrdiff_t endPtr = (ptrdiff_t)ptr + count * sizeof(uint32_t);
+
+  // Check for GPU fill
+  // Selects GPU fill for SVM and Locked allocations if a GPU address is given and is mapped.
+  if (info.agentBaseAddress <= ptr &&
+      endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
+    core::Agent* blit_agent = core::Agent::Convert(info.agentOwner);
+    if (blit_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
+      blit_agent = nullptr;
+      for (int i = 0; i < agent_count; i++) {
+        if (core::Agent::Convert(accessible[i])->device_type() ==
+            core::Agent::DeviceType::kAmdGpuDevice) {
+          blit_agent = core::Agent::Convert(accessible[i]);
+          break;
+        }
+      }
+    }
+    if (blit_agent) return blit_agent->DmaFill(ptr, value, count);
+  }
+
+  // Host and unmapped SVM addresses copy via host.
+  if (info.hostBaseAddress <= ptr && endPtr <= (ptrdiff_t)info.hostBaseAddress + info.sizeInBytes) {
+    memset(ptr, value, count * sizeof(uint32_t));
+    return HSA_STATUS_SUCCESS;
+  }
+
+  return HSA_STATUS_ERROR_INVALID_ALLOCATION;
 }
 
 hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
@@ -646,6 +680,8 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
   HsaPointerInfo thunkInfo;
   uint32_t* mappedNodes;
 
+  hsa_amd_pointer_info_t retInfo;
+
   // check output struct is at least as large as the first info revision.
   if (info->size < sizeof(struct hsa_amd_pointer_info_v1_s))
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -674,12 +710,15 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
   static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS,
                 "Thunk pointer info mismatch");
 
-  info->size = Min(info->size, sizeof(struct hsa_amd_pointer_info_v1_s));
-  info->type = (hsa_amd_pointer_type_t)thunkInfo.Type;
-  info->agentBaseAddress = (void*)thunkInfo.GPUAddress;
-  info->hostBaseAddress = thunkInfo.CPUAddress;
-  info->sizeInBytes = thunkInfo.SizeInBytes;
-  info->userData = thunkInfo.UserData;
+  retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));
+  retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
+  retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
+  retInfo.hostBaseAddress = thunkInfo.CPUAddress;
+  retInfo.sizeInBytes = thunkInfo.SizeInBytes;
+  retInfo.userData = thunkInfo.UserData;
+  retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
+
+  memcpy(info, &retInfo, retInfo.size);
 
   if (returnListData) {
     uint32_t count = 0;
diff --git a/src/core/util/flag.h b/src/core/util/flag.h
index d7add470c..ce256e6bd 100644
--- a/src/core/util/flag.h
+++ b/src/core/util/flag.h
@@ -69,18 +69,9 @@ class Flag {
 
     var = os::GetEnvVar("HSA_ENABLE_INTERRUPT");
     enable_interrupt_ = (var == "0") ? false : true;
 
-    var = os::GetEnvVar("HSA_ENABLE_THREAD_TRACE");
-    enable_thread_trace_ = (var == "1") ? true : false;
-
-    var = os::GetEnvVar("HSA_THREAD_TRACE_MEM_SIZE");
-    thread_trace_buff_size_ = atoi(var.c_str());
-
     var = os::GetEnvVar("HSA_ENABLE_SDMA");
     enable_sdma_ = (var == "0") ? false : true;
 
-    var = os::GetEnvVar("HSA_EMULATE_AQL");
-    emulate_aql_ = (var == "1") ? true : false;
-
     var = os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND");
     running_valgrind_ = (var == "1") ? true : false;
@@ -104,14 +95,8 @@ class Flag {
 
   bool enable_interrupt() const { return enable_interrupt_; }
 
-  bool enable_thread_trace() const { return enable_thread_trace_; }
-
-  bool thread_trace_buff_size() const { return thread_trace_buff_size_; }
-
   bool enable_sdma() const { return enable_sdma_; }
 
-  bool emulate_aql() const { return emulate_aql_; }
-
   bool running_valgrind() const { return running_valgrind_; }
 
   bool sdma_wait_idle() const { return sdma_wait_idle_; }
@@ -127,14 +112,10 @@ class Flag {
   bool enable_vm_fault_message_;
   bool enable_interrupt_;
   bool enable_sdma_;
-  bool emulate_aql_;
   bool running_valgrind_;
   bool sdma_wait_idle_;
   bool enable_queue_fault_message_;
-  bool enable_thread_trace_;
-  size_t thread_trace_buff_size_;
-
 
   uint32_t max_queues_;
 
   size_t scratch_mem_size_;
diff --git a/src/core/util/timer.h b/src/core/util/timer.h
index 65b9dfb6b..914bda34e 100644
--- a/src/core/util/timer.h
+++ b/src/core/util/timer.h
@@ -147,8 +147,7 @@ class fast_clock {
 #ifdef __x86_64__
   static __forceinline raw_rep raw_now() { return __rdtsc(); }
   static __forceinline raw_frequency raw_freq() { return freq; }
-#endif
-#ifdef __aarch64__
+#else
   static __forceinline raw_rep raw_now() {
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
diff --git a/src/core/util/utils.h b/src/core/util/utils.h
index b3e60a13e..536d81632 100644
--- a/src/core/util/utils.h
+++ b/src/core/util/utils.h
@@ -56,11 +56,6 @@ typedef uint64_t uint64;
 #if defined(__GNUC__)
 #if defined(__i386__) || defined(__x86_64__)
 #include <x86intrin.h>
-#elif defined(__aarch64__)
-#else
-#error \
-    "Processor or compiler not identified. " \
-    "Need to provide a lightweight approximate clock interface via function uint64_t __rdtsc() or adapt timer.h to your platform."
 #endif
 
 #define __forceinline __inline__ __attribute__((always_inline))
diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h
index e184ff15c..85bddcc51 100755
--- a/src/inc/hsa_ext_amd.h
+++ b/src/inc/hsa_ext_amd.h
@@ -1164,14 +1164,17 @@ hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
  *
  * @param[in] count Number of uint32_t element to be set to the value.
  *
- * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * @retval HSA_STATUS_SUCCESS The function has been executed successfully.
  *
- * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
  * initialized.
  *
- * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
  * not 4 bytes aligned
  *
+ * @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory
+ * region was not allocated with HSA runtime APIs.
+ *
  */
 hsa_status_t HSA_API
     hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
@@ -1346,6 +1349,23 @@ typedef struct hsa_amd_pointer_info_v1_s {
   Application provided value.
   */
   void* userData;
+} hsa_amd_pointer_info_v1_t;
+
+/**
+ * @brief Minor version updates to pointer info.
+ */
+#ifdef __cplusplus
+typedef struct hsa_amd_pointer_info_v2_s : hsa_amd_pointer_info_v1_t {
+#else
+typedef struct hsa_amd_pointer_info_v2_t {
+  struct hsa_amd_pointer_info_v1_t;
+#endif
+  /*
+  Reports an agent which "owns" (ie has preferred access to) the pool in which the allocation was
+  made. When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die
+  GPU boards) any such agent may be returned.
+  */
+  hsa_agent_t agentOwner;
 } hsa_amd_pointer_info_t;
 
 /**
diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp
index 5780e3b99..5579a54f0 100644
--- a/src/libamdhsacode/amd_hsa_code.cpp
+++ b/src/libamdhsacode/amd_hsa_code.cpp
@@ -1505,15 +1505,11 @@ namespace code {
 
     bool AmdHsaCode::PullElfV2()
     {
-      Segment* note = NULL;
       for (size_t i = 0; i < img->segmentCount(); ++i) {
         Segment* s = img->segment(i);
         if (s->type() == PT_LOAD) {
           dataSegments.push_back(s);
         }
-        else if (s->type() == PT_NOTE && s->align() >= 4) {
-          note = s;
-        }
       }
       for (size_t i = 0; i < img->sectionCount(); ++i) {
         Section* sec = img->section(i);