diff --git a/README.md b/README.md index 718b54a9a..15ab7a41a 100644 --- a/README.md +++ b/README.md @@ -65,4 +65,4 @@ The information contained herein is for informational purposes only, and is subj AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. -Copyright (c) 2014-2017 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2014-2021 Advanced Micro Devices, Inc. All rights reserved. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dee403688..c81b196b8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -191,7 +191,6 @@ set ( SRCS core/util/lnx/os_linux.cpp core/common/shared.cpp core/common/hsa_table_interface.cpp loader/executable.cpp - loader/loaders.cpp libamdhsacode/amd_elf_image.cpp libamdhsacode/amd_hsa_code_util.cpp libamdhsacode/amd_hsa_locks.cpp @@ -386,6 +385,7 @@ endif() ## Packaging directives set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Package types to build") +set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.") ## Only pack the "binary" and "dev" components, post install script will add the directory link. set (CPACK_DEB_COMPONENT_INSTALL ON) @@ -399,9 +399,9 @@ set ( CPACK_PACKAGE_CONTACT "TODO Advanced Micro Devices, Inc." 
) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD Heterogeneous System Architecture HSA - Linux HSA Runtime for Boltzmann (ROCm) platforms" ) set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) -## Process the install scripts to update the CPACK variables -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/post_install DEBIAN/postinst @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pre_remove DEBIAN/prerm @ONLY) +## Process the Debian install/remove scripts to update the CPACK variables +configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) +configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) if ( DEFINED ENV{ROCM_LIBPATCH_VERSION} ) set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}" ) @@ -452,8 +452,12 @@ set ( CPACK_RPM_PACKAGE_PROVIDES "hsa-ext-rocr-dev" ) set ( CPACK_RPM_PACKAGE_OBSOLETES "hsa-ext-rocr-dev" ) set ( CPACK_RPM_PACKAGE_CONFLICTS "hsa-ext-rocr-dev" ) -set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) -set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) +## Process the Rpm install/remove scripts to update the CPACK variables +configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) +configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) + +set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post" ) +set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun" ) ## Include packaging include ( CPack ) diff --git a/src/DEBIAN/post_install b/src/DEBIAN/postinst.in similarity index 92% rename from src/DEBIAN/post_install rename to src/DEBIAN/postinst.in index 1b8ce96ef..6c8a8b652 100644 --- a/src/DEBIAN/post_install +++ b/src/DEBIAN/postinst.in @@ -44,12 +44,16 @@ set -e +# left-hand term originates from @ENABLE_LDCONFIG@ = ON/OFF at package build do_ldconfig() { - echo 
@CPACK_PACKAGING_INSTALL_PREFIX@/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig + if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + echo @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig + fi } case "$1" in - configure) + ( configure ) do_ldconfig # Workaround for CPACK directory symlink handling error. mkdir -p @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/include @@ -58,7 +62,7 @@ case "$1" in abort-upgrade|abort-remove|abort-deconfigure) echo "$1" ;; - *) + ( * ) exit 0 ;; esac diff --git a/src/DEBIAN/pre_remove b/src/DEBIAN/prerm.in similarity index 91% rename from src/DEBIAN/pre_remove rename to src/DEBIAN/prerm.in index c168ebc4f..aa212593b 100644 --- a/src/DEBIAN/pre_remove +++ b/src/DEBIAN/prerm.in @@ -44,20 +44,23 @@ set -e +# left-hand term originates from @ENABLE_LDCONFIG@ = ON/OFF at package build rm_ldconfig() { - rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig + if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig + fi } case "$1" in - remove) + ( remove ) rm_ldconfig # Workaround for CPACK directory symlink handling error. 
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/hsa ;; - purge) + ( purge ) ;; - *) + ( * ) exit 0 ;; esac - diff --git a/src/RPM/rpm_post b/src/RPM/post.in similarity index 90% rename from src/RPM/rpm_post rename to src/RPM/post.in index 7fcbaabda..117148233 100644 --- a/src/RPM/rpm_post +++ b/src/RPM/post.in @@ -40,4 +40,8 @@ ## ################################################################################ -echo /opt/rocm/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig +# left-hand term originates from @ENABLE_LDCONFIG@ = ON/OFF at package build +if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + echo @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig +fi diff --git a/src/RPM/rpm_postun b/src/RPM/postun.in similarity index 91% rename from src/RPM/rpm_postun rename to src/RPM/postun.in index 494844747..801525d68 100644 --- a/src/RPM/rpm_postun +++ b/src/RPM/postun.in @@ -40,6 +40,8 @@ ## ################################################################################ -if [ $1 -eq 0 ]; then - rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig +# left-hand term originates from @ENABLE_LDCONFIG@ = ON/OFF at package build +if [ $1 -eq 0 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then + rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig fi diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index e48f24c54..0ff7b5f8b 100644 --- a/src/core/common/hsa_table_interface.cpp +++ b/src/core/common/hsa_table_interface.cpp @@ -1151,6 +1151,27 @@ hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal, return amdExtTable->hsa_amd_signal_value_pointer_fn(signal, value_ptr); } +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + return amdExtTable->hsa_amd_svm_attributes_set_fn(ptr, size, attribute_list, attribute_count); +} + +// Mirrors Amd Extension Apis 
+hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + return amdExtTable->hsa_amd_svm_attributes_get_fn(ptr, size, attribute_list, attribute_count); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + return amdExtTable->hsa_amd_svm_prefetch_async_fn(ptr, size, agent, num_dep_signals, dep_signals, completion_signal); +} + // Tools only table interfaces. namespace rocr { diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index c00005ff0..df1f4f2b9 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -173,7 +173,8 @@ class GpuAgent : public GpuAgentInt { // @param [in] node Node id. Each CPU in different socket will get distinct // id. // @param [in] node_props Node property. - GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props); + // @param [in] xnack_mode XNACK mode of device. + GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode); // @brief GPU agent destructor. 
~GpuAgent(); diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h index 68cf52d74..0cf527baf 100644 --- a/src/core/inc/amd_gpu_shaders.h +++ b/src/core/inc/amd_gpu_shaders.h @@ -439,6 +439,32 @@ static const unsigned int kCodeTrapHandler9[] = { 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, }; +static const unsigned int kCodeTrapHandler90a[] = { + 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, + 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, + 0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, + 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, + 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, + 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, + 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, + 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, + 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, + 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, + 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, + 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, + 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, + 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, + 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, + 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, + 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, + 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, + 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, + 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, + 0xbef00080, 0xbef10080, 0xbef31a9e, 
0xbef81a8d, 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, +}; + static const unsigned int kCodeCopyAligned8[] = { 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, 0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050, diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h index e119a939e..cce01e658 100644 --- a/src/core/inc/amd_memory_region.h +++ b/src/core/inc/amd_memory_region.h @@ -95,7 +95,7 @@ class MemoryRegion : public core::MemoryRegion { /// @brief Unpin memory. static void MakeKfdMemoryUnresident(const void* ptr); - MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, + MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props); ~MemoryRegion(); diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h index 510e36960..9954b8fc7 100644 --- a/src/core/inc/hsa_ext_amd_impl.h +++ b/src/core/inc/hsa_ext_amd_impl.h @@ -246,6 +246,21 @@ hsa_status_t hsa_amd_deregister_deallocation_callback( hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, volatile hsa_signal_value_t** value_ptr); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + } // namespace amd } // namespace rocr diff --git a/src/core/inc/hsa_ven_amd_loader_impl.h b/src/core/inc/hsa_ven_amd_loader_impl.h index 51ba4289c..82167253a 100644 --- 
a/src/core/inc/hsa_ven_amd_loader_impl.h +++ b/src/core/inc/hsa_ven_amd_loader_impl.h @@ -78,6 +78,12 @@ namespace rocr { size_t size, hsa_code_object_reader_t *code_object_reader); + hsa_status_t + hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); } // namespace rocr #endif diff --git a/src/core/inc/memory_region.h b/src/core/inc/memory_region.h index 583b13a12..4f362fad2 100644 --- a/src/core/inc/memory_region.h +++ b/src/core/inc/memory_region.h @@ -57,8 +57,8 @@ class Agent; class MemoryRegion : public Checked<0x9C961F19EE175BB3> { public: - MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner) - : fine_grain_(fine_grain), full_profile_(full_profile), owner_(owner) { + MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner) + : fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), owner_(owner) { assert(owner_ != NULL); } @@ -112,12 +112,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { __forceinline bool fine_grain() const { return fine_grain_; } + __forceinline bool kernarg() const { return kernarg_; } + __forceinline bool full_profile() const { return full_profile_; } __forceinline core::Agent* owner() const { return owner_; } private: const bool fine_grain_; + const bool kernarg_; const bool full_profile_; core::Agent* owner_; diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h index 634224c5f..d232e6f95 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -292,6 +292,15 @@ class Runtime { hsa_status_t IPCDetach(void* ptr); + hsa_status_t SetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + + hsa_status_t GetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + + hsa_status_t SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, uint32_t num_dep_signals, + const 
hsa_signal_t* dep_signals, hsa_signal_t completion_signal); + const std::vector& cpu_agents() { return cpu_agents_; } const std::vector& gpu_agents() { return gpu_agents_; } @@ -395,6 +404,28 @@ class Runtime { std::vector arg_; }; + struct PrefetchRange; + typedef std::map prefetch_map_t; + + struct PrefetchOp { + void* base; + size_t size; + uint32_t node_id; + int remaining_deps; + hsa_signal_t completion; + std::vector dep_signals; + prefetch_map_t::iterator prefetch_map_entry; + }; + + struct PrefetchRange { + PrefetchRange() {} + PrefetchRange(size_t Bytes, PrefetchOp* Op) : bytes(Bytes), op(Op) {} + size_t bytes; + PrefetchOp* op; + prefetch_map_t::iterator prev; + prefetch_map_t::iterator next; + }; + // Will be created before any user could call hsa_init but also could be // destroyed before incorrectly written programs call hsa_shutdown. static KernelMutex bootstrap_lock_; @@ -444,6 +475,9 @@ class Runtime { /// @retval Index in ::link_matrix_. uint32_t GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to); + /// @brief Get most recently issued SVM prefetch agent for the range in question. + Agent* GetSVMPrefetchAgent(void* ptr, size_t size); + // Mutex object to protect multithreaded access to ::allocation_map_, // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo // registered & mapped arrays. @@ -485,6 +519,10 @@ class Runtime { // Contains the region, address, and size of previously allocated memory. std::map allocation_map_; + // Pending prefetch containers. 
+ KernelMutex prefetch_lock_; + prefetch_map_t prefetch_map_; + // Allocator using ::system_region_ std::function system_allocator_; diff --git a/src/core/inc/scratch_cache.h b/src/core/inc/scratch_cache.h index e85e22660..1e079babb 100644 --- a/src/core/inc/scratch_cache.h +++ b/src/core/inc/scratch_cache.h @@ -107,7 +107,7 @@ class ScratchCache { ScratchCache& operator=(const ScratchCache& rhs) = delete; ScratchCache& operator=(ScratchCache&& rhs) = delete; - ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {} + ScratchCache(deallocator_t deallocator) : dealloc(deallocator), available_bytes(0) {} ~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); } @@ -122,6 +122,7 @@ class ScratchCache { it->second.alloc(); info.queue_base = it->second.base; info.scratch_node = it; + available_bytes -= it->first; return true; } it++; @@ -136,6 +137,7 @@ class ScratchCache { info.queue_base = it->second.base; info.size = it->first; info.scratch_node = it; + available_bytes -= it->first; return true; } it++; @@ -152,6 +154,8 @@ class ScratchCache { return; } it->second.free(); + available_bytes += it->first; + assert(it->first == info.size && "Scratch cache size mismatch."); } bool trim(bool trim_nodes_in_use) { @@ -159,6 +163,7 @@ class ScratchCache { auto it = map.begin(); while (it != map.end()) { if (it->second.isFree()) { + available_bytes -= it->first; dealloc(it->second.base, it->first, it->second.large); auto temp = it; it++; @@ -181,9 +186,14 @@ class ScratchCache { info.scratch_node = it; } + size_t free_bytes() const { + return available_bytes; + } + private: map_t map; deallocator_t dealloc; + size_t available_bytes; }; } // namespace AMD diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp index 576f66369..ddbfc1a9d 100644 --- a/src/core/runtime/amd_cpu_agent.cpp +++ b/src/core/runtime/amd_cpu_agent.cpp @@ -69,40 +69,31 @@ void CpuAgent::InitRegionList() { std::vector 
mem_props(properties_.NumMemoryBanks); if (HSAKMT_STATUS_SUCCESS == - hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, - &mem_props[0])) { + hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, &mem_props[0])) { std::vector::iterator system_prop = std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool { return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM); }); - if (system_prop != mem_props.end()) { - MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, *system_prop); + HsaMemoryProperties system_props; + std::memset(&system_props, 0, sizeof(HsaMemoryProperties)); + system_props.HeapType = HSA_HEAPTYPE_SYSTEM; + system_props.SizeInBytes = 0; + system_props.VirtualBaseAddress = 0; - regions_.push_back(system_region_fine); + if (system_prop != mem_props.end()) system_props = *system_prop; - if (!is_apu_node) { - MemoryRegion* system_region_coarse = - new MemoryRegion(false, is_apu_node, this, *system_prop); + MemoryRegion* system_region_fine = + new MemoryRegion(true, false, is_apu_node, this, system_props); + regions_.push_back(system_region_fine); + MemoryRegion* system_region_kernarg = + new MemoryRegion(true, true, is_apu_node, this, system_props); + regions_.push_back(system_region_kernarg); - regions_.push_back(system_region_coarse); - } - } else { - HsaMemoryProperties system_props; - std::memset(&system_props, 0, sizeof(HsaMemoryProperties)); - - system_props.HeapType = HSA_HEAPTYPE_SYSTEM; - system_props.SizeInBytes = 0; - system_props.VirtualBaseAddress = 0; - - MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props); - regions_.push_back(system_region_fine); - - if (!is_apu_node) { - MemoryRegion* system_region_coarse = - new MemoryRegion(false, is_apu_node, this, system_props); - regions_.push_back(system_region_coarse); - } + if (!is_apu_node) { + MemoryRegion* system_region_coarse = + new MemoryRegion(false, 
false, is_apu_node, this, system_props); + regions_.push_back(system_region_coarse); } } } diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 06265bda2..eb1e010cb 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -77,7 +77,7 @@ extern HsaApiTable hsa_internal_api_table_; } // namespace core namespace AMD { -GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) +GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode) : GpuAgentInt(node), properties_(node_props), current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), @@ -112,16 +112,15 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported; if (isa_base->IsSrameccSupported()) { - sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 - ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; } rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported; if (isa_base->IsXnackSupported()) { // TODO: This needs to be obtained form KFD once HMM implemented. - xnack = profile_ == HSA_PROFILE_FULL ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + xnack = xnack_mode ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; } // Set instruction set architecture via node property, only on GPU device. 
@@ -202,6 +201,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_7; ASICShader compute_8; ASICShader compute_9; + ASICShader compute_90a; ASICShader compute_1010; ASICShader compute_10; }; @@ -212,6 +212,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {NULL, 0, 0, 0}, {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, }}, @@ -220,6 +221,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, }}, @@ -228,6 +230,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, }}, @@ -236,6 +239,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeFill7, sizeof(kCodeFill7), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill10, sizeof(kCodeFill10), 19, 8}, {kCodeFill10, sizeof(kCodeFill10), 19, 8}, }}}; @@ -254,6 +258,9 @@ void GpuAgent::AssembleShader(const char* 
func_name, AssembleTarget assemble_tar asic_shader = &compiled_shader_it->second.compute_8; break; case 9: + if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) + asic_shader = &compiled_shader_it->second.compute_90a; + else asic_shader = &compiled_shader_it->second.compute_9; break; case 10: @@ -302,6 +309,14 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2); AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1); + + if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) && + (isa_->GetStepping() == 10)) { + // Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a. + // FIXME: Assemble code objects from source at build time + int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1; + header->max_scratch_backing_memory_byte_size = uint64_t(gran_accvgprs) << 32; + } } // Copy shader code into the GPU-visible buffer. @@ -338,8 +353,7 @@ void GpuAgent::InitRegionList() { memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax; case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { - MemoryRegion* region = - new MemoryRegion(false, false, this, mem_props[mem_idx]); + MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]); regions_.push_back(region); @@ -348,7 +362,7 @@ void GpuAgent::InitRegionList() { // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. 
if ((properties_.HiveID != 0) || (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) { - regions_.push_back(new MemoryRegion(true, false, this, mem_props[mem_idx])); + regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx])); } } break; @@ -816,12 +830,16 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AGENT_INFO_DEVICE: *((hsa_device_type_t*)value) = HSA_DEVICE_TYPE_GPU; break; - case HSA_AGENT_INFO_CACHE_SIZE: + case HSA_AGENT_INFO_CACHE_SIZE: { std::memset(value, 0, sizeof(uint32_t) * 4); - // TODO: no GPU cache info from KFD. Hardcode for now. - // GCN whitepaper: L1 data cache is 16KB. - ((uint32_t*)value)[0] = 16 * 1024; - break; + assert(cache_props_.size() > 0 && "GPU cache info missing."); + const size_t num_cache = cache_props_.size(); + for (size_t i = 0; i < num_cache; ++i) { + const uint32_t line_level = cache_props_[i].CacheLevel; + if (reinterpret_cast(value)[line_level - 1] == 0) + reinterpret_cast(value)[line_level - 1] = cache_props_[i].CacheSize * 1024; + } + } break; case HSA_AGENT_INFO_ISA: *((hsa_isa_t*)value) = core::Isa::Handle(isa_); break; @@ -1089,12 +1107,13 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single allocation. */ + ScopedAcquire lock(&scratch_lock_); size_t small_limit = scratch_pool_.size() >> 3; // Lift limit for 2.10 release RCCL workaround. 
size_t single_limit = 146800640; //small_limit >> 2; bool use_reclaim = true; bool large = (scratch.size > single_limit) || - (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); + (scratch_pool_.size() - scratch_pool_.remaining() - scratch_cache_.free_bytes() + scratch.size > small_limit); if ((isa_->GetMajorVersion() < 8) || core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) { large = false; @@ -1122,8 +1141,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { // Lambda called in place. // Used to allow exit from nested loops. [&]() { - ScopedAcquire lock(&scratch_lock_); // Check scratch cache + scratch.large = large; if (scratch_cache_.alloc(scratch)) return; // Attempt new allocation. diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index fca439866..4419286fd 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -100,9 +100,9 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { hsaKmtUnmapMemoryToGPU(const_cast(ptr)); } -MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, +MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props) - : core::MemoryRegion(fine_grain, full_profile, owner), + : core::MemoryRegion(fine_grain, kernarg, full_profile, owner), mem_props_(mem_props), max_single_alloc_size_(0), virtual_size_(0), @@ -128,6 +128,8 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; + if (kernarg) mem_flag_.ui32.Uncached = 1; + virtual_size_ = (full_profile) ? 
os::GetUserModeVirtualMemorySize() : kGpuVmSize; } @@ -290,16 +292,14 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_GLOBAL_FLAGS: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: - *((uint32_t*)value) = fine_grain() - ? (HSA_REGION_GLOBAL_FLAG_KERNARG | - HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; - break; - case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: - *((uint32_t*)value) = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: { + uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED + : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG; + *((uint32_t*)value) = ret; break; + } default: *((uint32_t*)value) = 0; break; diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp index 185838489..71bc73821 100644 --- a/src/core/runtime/amd_topology.cpp +++ b/src/core/runtime/amd_topology.cpp @@ -68,6 +68,38 @@ namespace AMD { static const uint kKfdVersionMajor = 0; static const uint kKfdVersionMinor = 99; +// Query for user preference and use that to determine Xnack mode of ROCm system. +// Return true if Xnack mode is ON or false if OFF. Xnack mode of a system is +// orthogonal to devices that do not support Xnack mode. It is legal for a +// system with Xnack ON to have devices that do not support Xnack functionality. 
+bool BindXnackMode() { + // Get users' preference for Xnack mode of ROCm platform + HSAint32 mode; + mode = core::Runtime::runtime_singleton_->flag().xnack(); + bool config_xnack = + (core::Runtime::runtime_singleton_->flag().xnack() != Flag::XNACK_REQUEST::XNACK_UNCHANGED); + + // Indicate to driver users' preference for Xnack mode + // Call to driver can fail and is a supported feature + HSAKMT_STATUS status = HSAKMT_STATUS_ERROR; + if (config_xnack) { + status = hsaKmtSetXNACKMode(mode); + if (status == HSAKMT_STATUS_SUCCESS) { + return mode; + } + } + + // Get Xnack mode of devices bound by driver. This could happen + // when a call to SET Xnack mode fails or user has no particular + // preference + status = hsaKmtGetXNACKMode((HSAint32*)&mode); + if(status != HSAKMT_STATUS_SUCCESS) { + debug_print("KFD does not support xnack mode query.\nROCr must assume xnack is disabled.\n"); + return false; + } + return mode; +} + CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { if (node_prop.NumCPUCores == 0) { return nullptr; @@ -79,14 +111,14 @@ CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { return cpu; } -GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { +GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnack_mode) { GpuAgent* gpu = nullptr; if (node_prop.NumFComputeCores == 0) { // Ignore non GPUs. 
return nullptr; } try { - gpu = new GpuAgent(node_id, node_prop); + gpu = new GpuAgent(node_id, node_prop, xnack_mode); const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion(); @@ -111,7 +143,7 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { if (gpu->isa()->GetProcessorName() == "gfx908") { node_prop.Capability.ui32.SRAM_EDCSupport = 1; delete gpu; - gpu = new GpuAgent(node_id, node_prop); + gpu = new GpuAgent(node_id, node_prop, xnack_mode); } } } catch (const hsa_exception& e) { @@ -174,20 +206,28 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) { link_info.atomic_support_32bit = true; link_info.atomic_support_64bit = true; link_info.coherent_support = true; + if (core::Runtime::runtime_singleton_->flag().patch_xgmi_link_weight()) { + if (io_link.Weight == 0) { + io_link.Weight = 15; + } + } break; default: debug_print("Unrecognized IOLINK type.\n"); break; } - if (io_link.Flags.ui32.Override == 1) { - if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) { - // Ignore this link since peer to peer is not allowed. - continue; + // KFD is reporting wrong override status for XGMI. Disallow override for bringup. + if (!core::Runtime::runtime_singleton_->flag().patch_link_override()) { + if (io_link.Flags.ui32.Override == 1) { + if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) { + // Ignore this link since peer to peer is not allowed. 
+ continue; + } + link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0); + link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0); + link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0); } - link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0); - link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0); - link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0); } link_info.max_bandwidth = io_link.MaximumBandwidth; @@ -204,7 +244,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) { /** * Process the list of Gpus that are surfaced to user */ -static void SurfaceGpuList(std::vector& gpu_list) { +static void SurfaceGpuList(std::vector& gpu_list, bool xnack_mode) { // Process user visible Gpu devices int32_t invalidIdx = -1; int32_t list_sz = gpu_list.size(); @@ -221,7 +261,7 @@ static void SurfaceGpuList(std::vector& gpu_list) { // Instantiate a Gpu device. The IO links // of this node have already been registered assert((node_prop.NumFComputeCores != 0) && "Improper node used for GPU device discovery."); - DiscoverGpu(gpu_list[idx], node_prop); + DiscoverGpu(gpu_list[idx], node_prop, xnack_mode); } } @@ -305,8 +345,11 @@ void BuildTopology() { RegisterLinkInfo(node_id, node_prop.NumIOLinks); } + // Determine the Xnack mode to be bound for system + bool xnack_mode = BindXnackMode(); + // Instantiate ROCr objects to encapsulate Gpu devices - SurfaceGpuList(gpu_usr_list); + SurfaceGpuList(gpu_usr_list, xnack_mode); } bool Load() { diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index e9f72adb3..f496f2ca9 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -341,6 +341,8 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin {"hsa_ext_finalizer_1_00_pfn_t", sizeof(hsa_ext_finalizer_1_00_pfn_t)}, {"hsa_ven_amd_loader_1_00_pfn_t", sizeof(hsa_ven_amd_loader_1_00_pfn_t)}, 
{"hsa_ven_amd_loader_1_01_pfn_t", sizeof(hsa_ven_amd_loader_1_01_pfn_t)}, + {"hsa_ven_amd_loader_1_02_pfn_t", sizeof(hsa_ven_amd_loader_1_02_pfn_t)}, + {"hsa_ven_amd_loader_1_03_pfn_t", sizeof(hsa_ven_amd_loader_1_03_pfn_t)}, {"hsa_ven_amd_aqlprofile_1_00_pfn_t", sizeof(hsa_ven_amd_aqlprofile_1_00_pfn_t)}}; static const size_t num_tables = sizeof(sizes) / sizeof(sizes_t); @@ -448,15 +450,21 @@ hsa_status_t hsa_system_get_major_extension_table(uint16_t extension, uint16_t v if (extension == HSA_EXTENSION_AMD_LOADER) { if (version_major != 1) return HSA_STATUS_ERROR; - hsa_ven_amd_loader_1_01_pfn_t ext_table; - ext_table.hsa_ven_amd_loader_query_host_address = hsa_ven_amd_loader_query_host_address; + hsa_ven_amd_loader_1_03_pfn_t ext_table; + ext_table.hsa_ven_amd_loader_query_host_address = + hsa_ven_amd_loader_query_host_address; ext_table.hsa_ven_amd_loader_query_segment_descriptors = hsa_ven_amd_loader_query_segment_descriptors; - ext_table.hsa_ven_amd_loader_query_executable = hsa_ven_amd_loader_query_executable; + ext_table.hsa_ven_amd_loader_query_executable = + hsa_ven_amd_loader_query_executable; ext_table.hsa_ven_amd_loader_executable_iterate_loaded_code_objects = hsa_ven_amd_loader_executable_iterate_loaded_code_objects; ext_table.hsa_ven_amd_loader_loaded_code_object_get_info = hsa_ven_amd_loader_loaded_code_object_get_info; + ext_table.hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size = + hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size; + ext_table.hsa_ven_amd_loader_iterate_executables = + hsa_ven_amd_loader_iterate_executables; memcpy(table, &ext_table, Min(sizeof(ext_table), table_length)); @@ -1547,7 +1555,7 @@ hsa_status_t hsa_agent_iterate_isas( const Isa *isa_object = agent_object->isa(); if (!isa_object) { - return HSA_STATUS_ERROR_INVALID_AGENT; + return HSA_STATUS_SUCCESS; } return callback(Isa::Handle(isa_object), data); @@ -1894,9 +1902,13 @@ static std::string ConvertOldTargetNameToNew( NewName = 
"amdgcn-amd-amdhsa--gfx906"; xnack_supported = true; } + else if (OldName == "AMD:AMDGPU:9:0:12") { + NewName = "amdgcn-amd-amdhsa--gfx90c"; + xnack_supported = true; + } else { - // Code object v2 only supports asics up to gfx906. Do NOT add handling - // of new asics into this if-else-if* block. + // Code object v2 only supports asics up to gfx906 plus gfx90c. Do NOT add + // handling of new asics into this if-else-if* block. return ""; } diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp index 9e50971b3..605ec15ae 100644 --- a/src/core/runtime/hsa_api_trace.cpp +++ b/src/core/runtime/hsa_api_trace.cpp @@ -391,6 +391,9 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback; amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback; amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer; + amd_ext_api.hsa_amd_svm_attributes_set_fn = AMD::hsa_amd_svm_attributes_set; + amd_ext_api.hsa_amd_svm_attributes_get_fn = AMD::hsa_amd_svm_attributes_get; + amd_ext_api.hsa_amd_svm_prefetch_async_fn = AMD::hsa_amd_svm_prefetch_async; } void LoadInitialHsaApiTable() { diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index f4776eb99..66d7ff727 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -1009,5 +1009,37 @@ hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifie CATCH; } +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + TRY; + IS_OPEN(); + return core::Runtime::runtime_singleton_->SetSvmAttrib(ptr, size, attribute_list, + attribute_count); + CATCH; +} + +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + TRY; + 
IS_OPEN(); + return core::Runtime::runtime_singleton_->GetSvmAttrib(ptr, size, attribute_list, + attribute_count); + CATCH; +} + +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + TRY; + IS_OPEN(); + // Validate inputs. + // if (core::g_use_interrupt_wait && (!core::InterruptSignal::IsType(signal))) + return core::Runtime::runtime_singleton_->SvmPrefetch(ptr, size, agent, num_dep_signals, + dep_signals, completion_signal); + CATCH; +} + } // namespace amd } // namespace rocr diff --git a/src/core/runtime/hsa_ven_amd_loader.cpp b/src/core/runtime/hsa_ven_amd_loader.cpp index a74181d3b..c94563715 100644 --- a/src/core/runtime/hsa_ven_amd_loader.cpp +++ b/src/core/runtime/hsa_ven_amd_loader.cpp @@ -53,6 +53,7 @@ using namespace core; using loader::CodeObjectReaderImpl; using loader::Executable; using loader::LoadedCodeObject; +using loader::Loader; namespace AMD { @@ -273,4 +274,30 @@ hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size( } catch(...) { return AMD::handleException(); } } +namespace { + +Loader *GetLoader() { + return Runtime::runtime_singleton_->loader(); +} + +} // namespace anonymous + +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) { + try { + if (!Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == callback) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + return GetLoader()->IterateExecutables(callback, data); + } catch(...) 
{ return AMD::handleException(); } +} + } // namespace rocr diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index 8b889bb5d..b49dd65d9 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -281,6 +281,21 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx908:sramecc-:xnack+", 9, 0, 8, disabled, enabled) ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack-", 9, 0, 8, enabled, disabled) ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack+", 9, 0, 8, enabled, enabled) + ISAREG_ENTRY_GEN("gfx909", 9, 0, 9, unsupported, any) + ISAREG_ENTRY_GEN("gfx909:xnack-", 9, 0, 9, unsupported, disabled) + ISAREG_ENTRY_GEN("gfx909:xnack+", 9, 0, 9, unsupported, enabled) + ISAREG_ENTRY_GEN("gfx90a", 9, 0, 10, any, any) + ISAREG_ENTRY_GEN("gfx90a:xnack-", 9, 0, 10, any, disabled) + ISAREG_ENTRY_GEN("gfx90a:xnack+", 9, 0, 10, any, enabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc-", 9, 0, 10, disabled, any) + ISAREG_ENTRY_GEN("gfx90a:sramecc+", 9, 0, 10, enabled, any) + ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack-", 9, 0, 10, disabled, disabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack+", 9, 0, 10, disabled, enabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack-", 9, 0, 10, enabled, disabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack+", 9, 0, 10, enabled, enabled) + ISAREG_ENTRY_GEN("gfx90c", 9, 0, 12, unsupported, any) + ISAREG_ENTRY_GEN("gfx90c:xnack-", 9, 0, 12, unsupported, disabled) + ISAREG_ENTRY_GEN("gfx90c:xnack+", 9, 0, 12, unsupported, enabled) ISAREG_ENTRY_GEN("gfx1010", 10, 1, 0, unsupported, any) ISAREG_ENTRY_GEN("gfx1010:xnack-", 10, 1, 0, unsupported, disabled) ISAREG_ENTRY_GEN("gfx1010:xnack+", 10, 1, 0, unsupported, enabled) diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index c3cce57a1..93ad27186 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -60,6 +60,7 @@ #include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/hsa_api_trace_int.h" #include "core/util/os.h" +#include "core/inc/exceptions.h" 
#include "inc/hsa_ven_amd_aqlprofile.h" #define HSA_VERSION_MAJOR 1 @@ -173,16 +174,29 @@ void Runtime::RegisterAgent(Agent* agent) { if (cpu_agents_.size() == 1) { // Might need memory pooling to cover allocation that // requires less than 4096 bytes. - system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* { - assert(align <= 4096); - void* ptr = nullptr; - core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr); - return ptr; - }; - - system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; - BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); + // Default system pool must support kernarg + for (auto pool : system_regions_fine_) { + if (pool->kernarg()) { + system_allocator_ = [pool](size_t size, size_t alignment, + MemoryRegion::AllocateFlags alloc_flags) -> void* { + assert(alignment <= 4096); + void* ptr = NULL; + return (HSA_STATUS_SUCCESS == + core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, + &ptr)) + ? 
ptr + : NULL; + }; + + system_deallocator_ = [](void* ptr) { + core::Runtime::runtime_singleton_->FreeMemory(ptr); + }; + + BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); + break; + } + } } } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) { gpu_agents_.push_back(agent); @@ -630,6 +644,22 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { *(const char**)value = STRING(ROCR_BUILD_ID); break; } + case HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: { + bool ret = true; + for (auto agent : gpu_agents_) { + AMD::GpuAgent* gpu = (AMD::GpuAgent*)agent; + ret &= (gpu->properties().Capability.ui32.SVMAPISupported == 1); + } + *(bool*)value = ret; + break; + } + case HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: { + bool ret = true; + for(auto agent : gpu_agents_) + ret &= (agent->isa()->GetXnack() == IsaFeature::Enabled); + *(bool*)value = ret; + break; + } default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -1021,15 +1051,52 @@ void Runtime::AsyncEventsLoop(void*) { if (index == 0) { hsa_signal_handle(async_events_control_.wake)->StoreRelaxed(0); } else if (index != -1) { - // No error or timout occured, process the handler + // No error or timout occured, process the handlers + // Call handler for the known satisfied signal. assert(async_events_.handler_[index] != NULL); - bool keep = - async_events_.handler_[index](value, async_events_.arg_[index]); + bool keep = async_events_.handler_[index](value, async_events_.arg_[index]); if (!keep) { hsa_signal_handle(async_events_.signal_[index])->Release(); async_events_.CopyIndex(index, async_events_.Size() - 1); async_events_.PopBack(); } + // Check remaining signals before sleeping. 
+ for (size_t i = index; i < async_events_.Size(); i++) { + hsa_signal_handle sig(async_events_.signal_[i]); + + value = atomic::Load(&sig->signal_.value, std::memory_order_relaxed); + bool condition_met = false; + + switch (async_events_.cond_[i]) { + case HSA_SIGNAL_CONDITION_EQ: { + condition_met = (value == async_events_.value_[i]); + break; + } + case HSA_SIGNAL_CONDITION_NE: { + condition_met = (value != async_events_.value_[i]); + break; + } + case HSA_SIGNAL_CONDITION_GTE: { + condition_met = (value >= async_events_.value_[i]); + break; + } + case HSA_SIGNAL_CONDITION_LT: { + condition_met = (value < async_events_.value_[i]); + break; + } + } + + if (condition_met) { + assert(async_events_.handler_[i] != NULL); + bool keep = async_events_.handler_[i](value, async_events_.arg_[i]); + if (!keep) { + hsa_signal_handle(async_events_.signal_[i])->Release(); + async_events_.CopyIndex(i, async_events_.Size() - 1); + async_events_.PopBack(); + i--; + } + } + } } // Check for dead signals @@ -1431,8 +1498,7 @@ void Runtime::LoadTools() { if (tool != NULL) { tool_libs_.push_back(tool); - tool_init_t ld; - ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad"); + rocr::AMD::callback_t ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad"); if (ld) { if (!ld(&hsa_api_table_.hsa_api, hsa_api_table_.hsa_api.version.major_id, @@ -1443,8 +1509,8 @@ void Runtime::LoadTools() { } } - tool_wrap_t wrap; - wrap = (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent"); + rocr::AMD::callback_t wrap = + (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent"); if (wrap) { std::vector* agent_lists[2] = {&cpu_agents_, &gpu_agents_}; @@ -1461,8 +1527,7 @@ void Runtime::LoadTools() { } } - tool_add_t add; - add = (tool_add_t)os::GetExportAddress(tool, "AddAgent"); + rocr::AMD::callback_t add = (tool_add_t)os::GetExportAddress(tool, "AddAgent"); if (add) add(this); } else { @@ -1572,5 +1637,499 @@ void Runtime::InternalQueueCreateNotify(const hsa_queue_t* queue, hsa_agent_t ag 
internal_queue_create_notifier_(queue, agent, internal_queue_create_notifier_user_data_); } +hsa_status_t Runtime::SetSvmAttrib(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + uint32_t set_attribs = 0; + std::vector agent_seen(agents_by_node_.size(), false); + + std::vector attribs; + attribs.reserve(attribute_count); + uint32_t set_flags = 0; + uint32_t clear_flags = 0; + + auto Convert = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent == nullptr) || !agent->IsValid()) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::SetSvmAttrib."); + return agent; + }; + + auto ConvertAllowNull = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent != nullptr) && (!agent->IsValid())) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::SetSvmAttrib."); + return agent; + }; + + auto ConfirmNew = [&](Agent* agent) { + if (agent_seen[agent->node_id()]) + throw AMD::hsa_exception( + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, + "Multiple attributes given for the same agent in Runtime::SetSvmAttrib."); + agent_seen[agent->node_id()] = true; + }; + + auto Check = [&](uint64_t attrib) { + if (set_attribs & (1 << attrib)) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, + "Attribute given multiple times in Runtime::SetSvmAttrib."); + set_attribs |= (1 << attrib); + }; + + auto kmtPair = [](uint32_t attrib, uint32_t value) { + HSA_SVM_ATTRIBUTE pair = {attrib, value}; + return pair; + }; + + for (uint32_t i = 0; i < attribute_count; i++) { + auto attrib = attribute_list[i].attribute; + auto value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: { + Check(attrib); + switch (value) { + case HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED: + set_flags |= 
HSA_SVM_FLAG_COHERENT; + break; + case HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED: + clear_flags |= HSA_SVM_FLAG_COHERENT; + break; + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Invalid HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG value."); + } + break; + } + case HSA_AMD_SVM_ATTRIB_READ_ONLY: { + Check(attrib); + if (value) + set_flags |= HSA_SVM_FLAG_GPU_RO; + else + clear_flags |= HSA_SVM_FLAG_GPU_RO; + break; + } + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + Check(attrib); + if (value) + set_flags |= HSA_SVM_FLAG_HIVE_LOCAL; + else + clear_flags |= HSA_SVM_FLAG_HIVE_LOCAL; + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + Check(attrib); + // Max migration size is 1GB. + if (value > 18) value = 18; + attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, value)); + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + Check(attrib); + Agent* agent = ConvertAllowNull(value); + if (agent == nullptr) + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, INVALID_NODEID)); + else + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, agent->node_id())); + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + set_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id())); + } + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + set_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS_IN_PLACE, agent->node_id())); + } + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + clear_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_NO_ACCESS, agent->node_id())); + } + 
break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::SetSvmAttrib"); + } + } + + // Merge CPU access properties - grant access if any CPU needs access. + // Probably wrong. + if (set_flags & HSA_SVM_FLAG_HOST_ACCESS) clear_flags &= ~HSA_SVM_FLAG_HOST_ACCESS; + + // Add flag updates + if (clear_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_CLR_FLAGS, clear_flags)); + if (set_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, set_flags)); + + uint8_t* base = AlignDown((uint8_t*)ptr, 4096); + uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096); + size_t len = end - base; + HSAKMT_STATUS error = hsaKmtSVMSetAttr(base, len, attribs.size(), &attribs[0]); + if (error != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMSetAttr failed."); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::GetSvmAttrib(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + std::vector attribs; + attribs.reserve(attribute_count); + + std::vector kmtIndices(attribute_count); + + bool getFlags = false; + + auto Convert = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent == nullptr) || !agent->IsValid()) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::GetSvmAttrib."); + return agent; + }; + + auto kmtPair = [](uint32_t attrib, uint32_t value) { + HSA_SVM_ATTRIBUTE pair = {attrib, value}; + return pair; + }; + + for (uint32_t i = 0; i < attribute_count; i++) { + auto& attrib = attribute_list[i].attribute; + auto& value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: + case HSA_AMD_SVM_ATTRIB_READ_ONLY: + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + getFlags = true; + kmtIndices[i] = -1; + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + kmtIndices[i] = attribs.size(); + 
attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, 0)); + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, 0)); + break; + } + case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: { + value = Agent::Convert(GetSVMPrefetchAgent(ptr, size)).handle; + kmtIndices[i] = -1; + break; + } + case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: { + Agent* agent = Convert(value); + if (agent->device_type() == Agent::kAmdCpuDevice) { + getFlags = true; + kmtIndices[i] = -1; + } else { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id())); + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::SetSvmAttrib"); + } + } + + if (getFlags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, 0)); + + uint8_t* base = AlignDown((uint8_t*)ptr, 4096); + uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096); + size_t len = end - base; + if (attribs.size() != 0) { + HSAKMT_STATUS error = hsaKmtSVMGetAttr(base, len, attribs.size(), &attribs[0]); + if (error != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMGetAttr failed."); + } + + for (uint32_t i = 0; i < attribute_count; i++) { + auto& attrib = attribute_list[i].attribute; + auto& value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: { + if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_COHERENT) + value = HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED; + else + value = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED; + break; + } + case HSA_AMD_SVM_ATTRIB_READ_ONLY: { + value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_GPU_RO); + break; + } + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HIVE_LOCAL); + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + value = attribs[kmtIndices[i]].value; + break; + } + case 
HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + uint64_t node = attribs[kmtIndices[i]].value; + Agent* agent = nullptr; + if (node != INVALID_NODEID) agent = agents_by_node_[node][0]; + value = Agent::Convert(agent).handle; + break; + } + case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: { + break; + } + case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: { + if (kmtIndices[i] == -1) { + if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HOST_ACCESS) + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE; + } else { + switch (attribs[kmtIndices[i]].type) { + case HSA_SVM_ATTR_ACCESS: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE; + break; + case HSA_SVM_ATTR_ACCESS_IN_PLACE: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE; + break; + case HSA_SVM_ATTR_NO_ACCESS: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS; + break; + default: + assert(false && "Bad agent accessibility from KFD."); + } + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::GetSvmAttrib"); + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + uintptr_t base = reinterpret_cast(AlignDown(ptr, 4096)); + uintptr_t end = AlignUp(reinterpret_cast(ptr) + size, 4096); + size_t len = end - base; + + PrefetchOp* op = new PrefetchOp(); + MAKE_NAMED_SCOPE_GUARD(OpGuard, [&]() { delete op; }); + + Agent* dest = Agent::Convert(agent); + if (dest->device_type() == Agent::kAmdCpuDevice) + op->node_id = 0; + else + op->node_id = dest->node_id(); + + op->base = reinterpret_cast(base); + op->size = len; + op->completion = completion_signal; + if (num_dep_signals > 1) { + op->remaining_deps = num_dep_signals - 1; + for (int i = 0; i < num_dep_signals - 1; i++) op->dep_signals.push_back(dep_signals[i]); + } else { + op->remaining_deps = 0; + } + + { + ScopedAcquire lock(&prefetch_lock_); + // Remove 
all fully overlapped and trim partially overlapped ranges. + // Get iteration bounds + auto start = prefetch_map_.upper_bound(base); + if (start != prefetch_map_.begin()) start--; + auto stop = prefetch_map_.lower_bound(end); + + auto isEndNode = [&](decltype(start) node) { return node->second.next == prefetch_map_.end(); }; + auto isFirstNode = [&](decltype(start) node) { + return node->second.prev == prefetch_map_.end(); + }; + + // Trim and remove old ranges. + while (start != stop) { + uintptr_t startBase = start->first; + uintptr_t startEnd = startBase + start->second.bytes; + + auto ibase = Max(startBase, base); + auto iend = Min(startEnd, end); + // Check for overlap + if (ibase < iend) { + // Second range check + if (iend < startEnd) { + auto ret = prefetch_map_.insert( + std::make_pair(iend, PrefetchRange(startEnd - iend, start->second.op))); + assert(ret.second && "Prefetch map insert failed during range split."); + + auto it = ret.first; + it->second.prev = start; + it->second.next = start->second.next; + start->second.next = it; + if (!isEndNode(it)) it->second.next->second.prev = it; + } + + // Is the first interval of the old range valid + if (startBase < ibase) { + start->second.bytes = ibase - startBase; + } else { + if (isFirstNode(start)) { + start->second.op->prefetch_map_entry = start->second.next; + if (!isEndNode(start)) start->second.next->second.prev = prefetch_map_.end(); + } else { + start->second.prev->second.next = start->second.next; + if (!isEndNode(start)) start->second.next->second.prev = start->second.prev; + } + prefetch_map_.erase(start); + } + } + start++; + } + + // Insert new range. + auto ret = prefetch_map_.insert(std::make_pair(base, PrefetchRange(len, op))); + assert(ret.second && "Prefetch map insert failed."); + + auto it = ret.first; + op->prefetch_map_entry = it; + it->second.next = it->second.prev = prefetch_map_.end(); + } + + // Remove the prefetch's ranges from the map. 
+ static auto removePrefetchRanges = [](PrefetchOp* op) { + ScopedAcquire lock(&Runtime::runtime_singleton_->prefetch_lock_); + auto it = op->prefetch_map_entry; + while (it != Runtime::runtime_singleton_->prefetch_map_.end()) { + auto next = it->second.next; + Runtime::runtime_singleton_->prefetch_map_.erase(it); + it = next; + } + }; + + // Prefetch Signal handler for synchronization. + static hsa_amd_signal_handler signal_handler = [](hsa_signal_value_t value, void* arg) { + PrefetchOp* op = reinterpret_cast(arg); + + if (op->remaining_deps > 0) { + op->remaining_deps--; + Runtime::runtime_singleton_->SetAsyncSignalHandler( + op->dep_signals[op->remaining_deps], HSA_SIGNAL_CONDITION_EQ, 0, signal_handler, arg); + return false; + } + + HSA_SVM_ATTRIBUTE attrib; + attrib.type = HSA_SVM_ATTR_PREFETCH_LOC; + attrib.value = op->node_id; + HSAKMT_STATUS error = hsaKmtSVMSetAttr(op->base, op->size, 1, &attrib); + assert(error == HSAKMT_STATUS_SUCCESS && "KFD Prefetch failed."); + + removePrefetchRanges(op); + + if (op->completion.handle != 0) Signal::Convert(op->completion)->SubRelaxed(1); + delete op; + + return false; + }; + + auto no_dependencies = [](void* arg) { signal_handler(0, arg); }; + + MAKE_NAMED_SCOPE_GUARD(RangeGuard, [&]() { removePrefetchRanges(op); }); + + hsa_status_t err; + if (num_dep_signals == 0) + err = AMD::hsa_amd_async_function(no_dependencies, op); + else + err = SetAsyncSignalHandler(dep_signals[num_dep_signals - 1], HSA_SIGNAL_CONDITION_EQ, 0, + signal_handler, op); + if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Signal handler unable to be set."); + + RangeGuard.Dismiss(); + OpGuard.Dismiss(); + return HSA_STATUS_SUCCESS; +} + +Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) { + uintptr_t base = reinterpret_cast(AlignDown(ptr, 4096)); + uintptr_t end = AlignUp(reinterpret_cast(ptr) + size, 4096); + size_t len = end - base; + + std::vector> holes; + + ScopedAcquire 
lock(&Runtime::runtime_singleton_->prefetch_lock_); + auto start = prefetch_map_.upper_bound(base); + if (start != prefetch_map_.begin()) start--; + auto stop = prefetch_map_.lower_bound(end); + + // KFD returns -1 for no or mixed destinations. + uint32_t prefetch_node = -2; + if (start != stop) { + prefetch_node = start->second.op->node_id; + } + + while (start != stop) { + uintptr_t startBase = start->first; + uintptr_t startEnd = startBase + start->second.bytes; + + auto ibase = Max(base, startBase); + auto iend = Min(end, startEnd); + // Check for intersection with the query + if (ibase < iend) { + // If prefetch locations are different then we report null agent. + if (prefetch_node != start->second.op->node_id) return nullptr; + + // Push leading gap to an array for checking KFD. + if (base < ibase) holes.push_back(std::make_pair(base, ibase - base)); + + // Trim query range. + base = iend; + } + start++; + } + if (base < end) holes.push_back(std::make_pair(base, end - base)); + + HSA_SVM_ATTRIBUTE attrib; + attrib.type = HSA_SVM_ATTR_PREFETCH_LOC; + for (auto& range : holes) { + HSAKMT_STATUS error = + hsaKmtSVMGetAttr(reinterpret_cast(range.first), range.second, 1, &attrib); + assert(error == HSAKMT_STATUS_SUCCESS && "KFD prefetch query failed."); + + if (attrib.value == -1) return nullptr; + if (prefetch_node == -2) prefetch_node = attrib.value; + if (prefetch_node != attrib.value) return nullptr; + } + + assert(prefetch_node != -2 && "prefetch_node was not updated."); + assert(prefetch_node != -1 && "Should have already returned."); + return agents_by_node_[prefetch_node][0]; +} + } // namespace core } // namespace rocr diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 5013dd1a8..88fbe93be 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -56,6 +56,11 @@ class Flag { public: enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT }; + // The values are meaningful and chosen to satisfy the thunk API. 
+ enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 }; + static_assert(XNACK_DISABLE == 0, "XNACK_REQUEST enum values improperly changed."); + static_assert(XNACK_ENABLE == 1, "XNACK_REQUEST enum values improperly changed."); + explicit Flag() { Refresh(); } virtual ~Flag() {} @@ -104,6 +109,12 @@ class Flag { var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR"); disable_fragment_alloc_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_UNPATCH_XGMI_LINK_WEIGHT"); + patch_xgmi_link_weight_ = (var == "1") ? false : true; + + var = os::GetEnvVar("HSA_UNPATCH_LINK_OVERRIDE"); + patch_link_override_ = (var == "1") ? false : true; + var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH"); enable_sdma_hdp_flush_ = (var == "0") ? false : true; @@ -130,6 +141,11 @@ class Flag { var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT"); check_sramecc_validity_ = (var == "1") ? false : true; + + // Legal values are zero "0" or one "1". Any other value will + // be interpreted as not defining the env variable. + var = os::GetEnvVar("HSA_XNACK"); + xnack_ = (var == "0") ? XNACK_DISABLE : ((var == "1") ? 
XNACK_ENABLE : XNACK_UNCHANGED); } bool check_flat_scratch() const { return check_flat_scratch_; } @@ -150,6 +166,11 @@ class Flag { bool disable_fragment_alloc() const { return disable_fragment_alloc_; } + // Temporary way to control ROCr interpretation of inter-device link weight + bool patch_xgmi_link_weight() const { return patch_xgmi_link_weight_; } + + bool patch_link_override() const { return patch_link_override_; } + bool rev_copy_dir() const { return rev_copy_dir_; } bool fine_grain_pcie() const { return fine_grain_pcie_; } @@ -178,6 +199,8 @@ class Flag { bool check_sramecc_validity() const { return check_sramecc_validity_; } + XNACK_REQUEST xnack() const { return xnack_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -195,6 +218,8 @@ class Flag { bool disable_image_; bool loader_enable_mmap_uri_; bool check_sramecc_validity_; + bool patch_xgmi_link_weight_; + bool patch_link_override_; SDMA_OVERRIDE enable_sdma_; @@ -209,6 +234,9 @@ class Flag { size_t force_sdma_size_; + // Indicates user preference for Xnack state. + XNACK_REQUEST xnack_; + DISALLOW_COPY_AND_ASSIGN(Flag); }; diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp index e60036929..6c2761151 100644 --- a/src/core/util/lnx/os_linux.cpp +++ b/src/core/util/lnx/os_linux.cpp @@ -74,7 +74,7 @@ void* __stdcall ThreadTrampoline(void* arg) { void* Data = ar->entry_args; delete ar; CallMe(Data); - return NULL; + return nullptr; } // Thread container allows multiple waits and separate close (destroy). 
@@ -99,7 +99,16 @@ class os_thread { assert(err == 0 && "pthread_attr_setstacksize failed."); } - int err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + int cores = get_nprocs_conf(); + cpu_set_t* cpuset = CPU_ALLOC(cores); + for(int i=0; i> 8; } -uint32_t MinorVerFromDevID(uint32_t dev_id) { - return (dev_id % 100)/10; -} +uint32_t MinorVerFromDevID(uint32_t dev_id) { return (dev_id >> 4) & 0xF; } -uint32_t StepFromDevID(uint32_t dev_id) { - return (dev_id%100)%10; -} +uint32_t StepFromDevID(uint32_t dev_id) { return dev_id & 0xF; } hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) { char asic_name[64]; @@ -78,7 +72,10 @@ hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) { assert(a_str.compare(0, 3, "gfx", 3) == 0); a_str.erase(0,3); - *chip_id = std::stoi(a_str); + + // Load chip_id accounting for stepping and minor in hex and major in dec. + *chip_id = std::stoi(a_str.substr(a_str.length() - 2), nullptr, 16); + *chip_id += (std::stoi(a_str.substr(0, a_str.length() - 2)) << 8); return HSA_STATUS_SUCCESS; } @@ -163,6 +160,7 @@ uint32_t DevIDToAddrLibFamily(uint32_t dev_id) { case 4: // Vega12 case 6: // Vega20 case 8: // Arcturus + case 10: // Aldebaran return FAMILY_AI; case 2: diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index adcdec490..698286979 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -115,10 +115,15 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3D = 0x03d, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3E = 0x03e, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X41 = 0x041, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. 
diff --git a/src/inc/hsa.h b/src/inc/hsa.h index d8fdd472b..fe1facca4 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -480,7 +480,21 @@ typedef enum { /** * String containing the ROCr build identifier. */ - HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200 + HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202 } hsa_system_info_t; /** diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h index bf3e9197e..35dd21bfa 100644 --- a/src/inc/hsa_api_trace.h +++ b/src/inc/hsa_api_trace.h @@ -183,6 +183,9 @@ struct AmdExtTable { decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn; decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn; decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn; + decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn; + decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn; + decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn; }; // Table to export HSA Core Runtime Apis diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 9df7c49ae..a0bc5d1d9 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -2116,6 +2116,162 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, hsa_amd_deallocation_callback_t callback); 
+typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency model. + */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Physical location of most recent prefetch command. + // If the prefetch location has not been set or is not uniform across the + // address range then returned hsa_agent_t::handle will be 0. + // Querying this attribute will return the destination agent of the most + // recent ::hsa_amd_svm_prefetch_async targeting the address range. If + // multiple async prefetches have been issued targeting the region and the + // most recently issued prefetch has completed then the query will return + // the location of the most recently completed prefetch. 
+ // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range. Access may incur a page fault + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM + // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range without page faults. Access + // will not incur a page fault and will not cause access based migration. + // Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if + // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Denies an agent access to the memory range. Access will cause a terminal + // segfault. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202, + // This attribute can not be used in ::hsa_amd_svm_attributes_set. + // Returns the access attribute associated with the agent. + // The agent to query must be set in the attribute value field. + // The attribute enum will be replaced with the agent's current access + // attribute for the address range. + // TODO: Clarify KFD return value for non-uniform access attribute. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203, +} hsa_amd_svm_attribute_t; + +// List type for hsa_amd_svm_attributes_set/get. +typedef struct hsa_amd_svm_attribute_pair_s { + // hsa_amd_svm_attribute_t value. + uint64_t attribute; + // Attribute value. 
Bit values should be interpreted according to the type + // given in the associated attribute description. + uint64_t value; +} hsa_amd_svm_attribute_pair_t; + +/** + * @brief Sets SVM memory attributes. + * + * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling + * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE + * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM + * memory access by that Agent. + * + * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION + * may not be used with this API. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Gets SVM memory attributes. + * + * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, + * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and + * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API. + * + * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an + * hsa_agent_t and returns the current access type through its attribute field. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to retrieve for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Asynchronously migrates memory to an agent. 
+ * + * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero. + * @p completion_signal will decrement when the migration is complete. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] agent Agent to migrate to. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the migration + * operation starts. The migration will start after every signal has been observed with + * the value 0. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the migration + * operation. When the migration operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the migration + * operation by setting the value of the completion signal to a negative + * number. If no completion signal is required this handle may be null. + */ +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/src/inc/hsa_ven_amd_loader.h b/src/inc/hsa_ven_amd_loader.h index 3ce847555..47236c86e 100644 --- a/src/inc/hsa_ven_amd_loader.h +++ b/src/inc/hsa_ven_amd_loader.h @@ -493,10 +493,44 @@ hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size( //===----------------------------------------------------------------------===// +/** + * @brief Iterate over the available executables, and invoke an + * application-defined callback on every iteration. While + * ::hsa_ven_amd_loader_iterate_executables is executing any calls to + * ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy will be blocked. 
+ * + * @param[in] callback Callback to be invoked once per executable. The HSA + * runtime passes two arguments to the callback: the executable and the + * application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_ven_amd_loader_iterate_executables returns that status value. If + * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy then the behavior is undefined. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + +//===----------------------------------------------------------------------===// + /** * @brief Extension version. */ -#define hsa_ven_amd_loader 001002 +#define hsa_ven_amd_loader 001003 /** * @brief Extension function table version 1.00. @@ -582,6 +616,50 @@ typedef struct hsa_ven_amd_loader_1_02_pfn_s { hsa_code_object_reader_t *code_object_reader); } hsa_ven_amd_loader_1_02_pfn_t; +/** + * @brief Extension function table version 1.03. 
+ */ +typedef struct hsa_ven_amd_loader_1_03_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + + hsa_status_t + (*hsa_ven_amd_loader_iterate_executables)( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); +} hsa_ven_amd_loader_1_03_pfn_t; + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index cb7964422..3b21b69c4 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -568,6 +568,7 @@ namespace code { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: name = "gfx906"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: name = "gfx908"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: name = "gfx909"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: name = "gfx90a"; xnack_supported = true; 
sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: name = "gfx90c"; xnack_supported = true; sramecc_supported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true; sramecc_supported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true; sramecc_supported = false; break; @@ -622,9 +623,11 @@ namespace code { mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; else if (old_name == "AMD:AMDGPU:9:0:6" || old_name == "AMD:AMDGPU:9:0:7") mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; + else if (old_name == "AMD:AMDGPU:9:0:12") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; else { - // Code object v2 only supports asics up to gfx906. Do NOT add handling - // of new asics into this if-else-if* block. + // Code object v2 only supports asics up to gfx906 plus gfx90c. Do NOT + // add handling of new asics into this if-else-if* block. return ""; } std::string name; diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp deleted file mode 100644 index a36ce2c95..000000000 --- a/src/loader/loaders.cpp +++ /dev/null @@ -1,281 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. 
-// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#include -#include -#include "loaders.hpp" - -namespace rocr { -namespace amd { -namespace hsa { -namespace loader { - - // Helper function that allocates an aligned memory. 
- static inline void* - alignedMalloc(size_t size, size_t alignment) - { - #if defined(_WIN32) - return ::_aligned_malloc(size, alignment); - #else - void * ptr = NULL; - alignment = (std::max)(alignment, sizeof(void*)); - if (0 == ::posix_memalign(&ptr, alignment, size)) { - return ptr; - } - return NULL; - #endif - } - - // Helper function that frees an aligned memory. - static inline void - alignedFree(void *ptr) - { - #if defined(_WIN32) - ::_aligned_free(ptr); - #else - free(ptr); - #endif - } - - OfflineLoaderContext::OfflineLoaderContext() - : out(std::cout) - { - invalid.handle = 0; - gfx700.handle = 700; - gfx701.handle = 701; - gfx702.handle = 702; - gfx801.handle = 801; - gfx802.handle = 802; - gfx803.handle = 803; - gfx805.handle = 805; - gfx810.handle = 810; - gfx900.handle = 900; - gfx902.handle = 902; - gfx904.handle = 904; - gfx906.handle = 906; - gfx908.handle = 908; - gfx1010.handle = 1010; - gfx1011.handle = 1011; - gfx1012.handle = 1012; - gfx1030.handle = 1030; - gfx1031.handle = 1031; - gfx1032.handle = 1032; - gfx1033.handle = 1033; - } - - hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name) - { - std::string sname(name); - if (sname == "AMD:AMDGPU:7:0:0") { - return gfx700; - } else if (sname == "AMD:AMDGPU:7:0:1") { - return gfx701; - } else if (sname == "AMD:AMDGPU:7:0:2") { - return gfx702; - } else if (sname == "AMD:AMDGPU:7:0:3") { - return gfx703; - } else if (sname == "AMD:AMDGPU:7:0:4") { - return gfx704; - } else if (sname == "AMD:AMDGPU:7:0:5") { - return gfx705; - } else if (sname == "AMD:AMDGPU:8:0:1") { - return gfx801; - } else if (sname == "AMD:AMDGPU:8:0:0" || sname == "AMD:AMDGPU:8:0:2") { - return gfx802; - } else if (sname == "AMD:AMDGPU:8:0:3" || sname == "AMD:AMDGPU:8:0:4") { - return gfx803; - } else if (sname == "AMD:AMDGPU:8:0:5") { - return gfx805; - } else if (sname == "AMD:AMDGPU:8:1:0") { - return gfx810; - } else if (sname == "AMD:AMDGPU:9:0:0" || sname == "AMD:AMDGPU:9:0:1") { - return gfx900; - } else 
if (sname == "AMD:AMDGPU:9:0:2" || sname == "AMD:AMDGPU:9:0:3") { - return gfx902; - } else if (sname == "AMD:AMDGPU:9:0:4" || sname == "AMD:AMDGPU:9:0:5") { - return gfx904; - } else if (sname == "AMD:AMDGPU:9:0:6" || sname == "AMD:AMDGPU:9:0:7") { - return gfx906; - } else if (sname == "AMD:AMDGPU:9:0:8") { - return gfx908; - } else if (sname == "AMD:AMDGPU:10:1:0") { - return gfx1010; - } else if (sname == "AMD:AMDGPU:10:1:1") { - return gfx1011; - } else if (sname == "AMD:AMDGPU:10:1:2") { - return gfx1012; - } else if (sname == "AMD:AMDGPU:10:3:0") { - return gfx1030; - } else if (sname == "AMD:AMDGPU:10:3:1") { - return gfx1031; - } else if (sname == "AMD:AMDGPU:10:3:2") { - return gfx1032; - } else if (sname == "AMD:AMDGPU:10:3:3") { - return gfx1033; - } - - // The offline loader only supports code object v2 which only supports - // asics up to gfx906. Do NOT add handling of new asics into this - // if-else-if* block. - assert(0); - return invalid; - } - - bool OfflineLoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) - { - return true; - } - - void* OfflineLoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) - { - void* ptr = alignedMalloc(size, align); - if (zero) { memset(ptr, 0, size); } - out << "SegmentAlloc: " << segment << ": " << "size=" << size << " align=" << align << " zero=" << zero << " result=" << ptr << std::endl; - pointers.insert(ptr); - return ptr; - } - - bool OfflineLoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) - { - out << "SegmentCopy: " << segment << ": " << "dst=" << dst << " offset=" << offset << " src=" << src << " size=" << size << std::endl; - if (!dst || !src || dst == src) { - return false; - } - if (0 == size) { - return true; - } - memcpy((char *) dst + offset, src, size); - return true; - } - - void 
OfflineLoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) - { - out << "SegmentFree: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl; - pointers.erase(seg); - alignedFree(seg); - } - - void* OfflineLoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) - { - out << "SegmentAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl; - return (char*) seg + offset; - } - - void* OfflineLoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) - { - out << "SegmentHostAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl; - return (char*) seg + offset; - } - - bool OfflineLoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) - { - out << "SegmentFreeze: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl; - return true; - } - - bool OfflineLoaderContext::ImageExtensionSupported() - { - return true; - } - - hsa_status_t OfflineLoaderContext::ImageCreate( - hsa_agent_t agent, - hsa_access_permission_t image_permission, - const hsa_ext_image_descriptor_t *image_descriptor, - const void *image_data, - hsa_ext_image_t *image_handle) - { - void* ptr = alignedMalloc(256, 8); - out << "ImageCreate" << ":" << - " permission=" << image_permission << - " geometry=" << image_descriptor->geometry << - " width=" << image_descriptor->width << - " height=" << image_descriptor->height << - " depth=" << image_descriptor->depth << - " array_size=" << image_descriptor->array_size << - " channel_type=" << image_descriptor->format.channel_type << - " channel_order=" << image_descriptor->format.channel_order<< - " data=" << image_data << - std::endl; - pointers.insert(ptr); - image_handle->handle = reinterpret_cast(ptr); - return HSA_STATUS_SUCCESS; - } - - hsa_status_t 
OfflineLoaderContext::ImageDestroy( - hsa_agent_t agent, hsa_ext_image_t image_handle) - { - void* ptr = reinterpret_cast(image_handle.handle); - pointers.erase(ptr); - alignedFree(ptr); - return HSA_STATUS_SUCCESS; - } - - hsa_status_t OfflineLoaderContext::SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle) - { - void* ptr = alignedMalloc(256, 8); - out << "SamplerCreate" << ":" << - " coordinate_mode=" << sampler_descriptor->coordinate_mode << - " filter_mode=" << sampler_descriptor->filter_mode << - " address_mode=" << sampler_descriptor->address_mode << - std::endl; - pointers.insert(ptr); - sampler_handle->handle = reinterpret_cast(ptr); - return HSA_STATUS_SUCCESS; - } - - hsa_status_t OfflineLoaderContext::SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) - { - void* ptr = reinterpret_cast(sampler_handle.handle); - pointers.erase(ptr); - alignedFree(ptr); - return HSA_STATUS_SUCCESS; - } - -} // namespace loader -} // namespace hsa -} // namespace amd -} // namespace rocr diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp deleted file mode 100644 index ef6ef2a59..000000000 --- a/src/loader/loaders.hpp +++ /dev/null @@ -1,110 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. 
-// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. 
-// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LOADERS_HPP_ -#define LOADERS_HPP_ - -#include "core/inc/amd_hsa_loader.hpp" -#include -#include - -namespace rocr { -namespace amd { -namespace hsa { -namespace loader { - - class OfflineLoaderContext : public amd::hsa::loader::Context { - private: - hsa_isa_t invalid; - hsa_isa_t gfx700, gfx701, gfx702, gfx703, gfx704, gfx705; - hsa_isa_t gfx801, gfx802, gfx803, gfx805, gfx810; - hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908; - hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032, gfx1033; - std::ostream& out; - typedef std::set PointerSet; - PointerSet pointers; - - public: - OfflineLoaderContext(); - - hsa_isa_t IsaFromName(const char *name) override; - - bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; - - void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override; - - bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) override; - - void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size = 0) override; - - void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; - - void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; - - bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override; - - bool ImageExtensionSupported() override; - - hsa_status_t ImageCreate( - hsa_agent_t agent, - hsa_access_permission_t image_permission, - const hsa_ext_image_descriptor_t *image_descriptor, - const void *image_data, - hsa_ext_image_t *image_handle) override; - - hsa_status_t ImageDestroy( - hsa_agent_t agent, hsa_ext_image_t image_handle) override; - - hsa_status_t SamplerCreate( - hsa_agent_t agent, - const 
hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle) override; - - hsa_status_t SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; - }; -} // namespace loader -} // namespace hsa -} // namespace amd -} // namespace rocr - -#endif // LOADERS_HPP_