diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dee403688..c81b196b8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -191,7 +191,6 @@ set ( SRCS core/util/lnx/os_linux.cpp core/common/shared.cpp core/common/hsa_table_interface.cpp loader/executable.cpp - loader/loaders.cpp libamdhsacode/amd_elf_image.cpp libamdhsacode/amd_hsa_code_util.cpp libamdhsacode/amd_hsa_locks.cpp @@ -386,6 +385,7 @@ endif() ## Packaging directives set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Package types to build") +set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.") ## Only pack the "binary" and "dev" components, post install script will add the directory link. set (CPACK_DEB_COMPONENT_INSTALL ON) @@ -399,9 +399,9 @@ set ( CPACK_PACKAGE_CONTACT "TODO Advanced Micro Devices, Inc." ) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD Heterogeneous System Architecture HSA - Linux HSA Runtime for Boltzmann (ROCm) platforms" ) set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) -## Process the install scripts to update the CPACK variables -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/post_install DEBIAN/postinst @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pre_remove DEBIAN/prerm @ONLY) +## Process the Debian install/remove scripts to update the CPACK variables +configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) +configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) if ( DEFINED ENV{ROCM_LIBPATCH_VERSION} ) set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}" ) @@ -452,8 +452,12 @@ set ( CPACK_RPM_PACKAGE_PROVIDES "hsa-ext-rocr-dev" ) set ( CPACK_RPM_PACKAGE_OBSOLETES "hsa-ext-rocr-dev" ) set ( CPACK_RPM_PACKAGE_CONFLICTS "hsa-ext-rocr-dev" ) -set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) -set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) +## Process the RPM install/remove scripts to update the CPACK variables +configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) +configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) + +set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post" ) +set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun" ) ## Include packaging include ( CPack ) diff --git a/src/DEBIAN/post_install b/src/DEBIAN/postinst.in similarity index 92% rename from src/DEBIAN/post_install rename to src/DEBIAN/postinst.in index 1b8ce96ef..6c8a8b652 100644 --- a/src/DEBIAN/post_install +++ b/src/DEBIAN/postinst.in @@ -44,12 +44,16 @@ set -e +# @ENABLE_LDCONFIG@ is substituted with ON or OFF when the package is built do_ldconfig() { - echo @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig + if [ "@ENABLE_LDCONFIG@" = "ON" ]; then + echo @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig + fi } case "$1" in - configure) + ( configure ) do_ldconfig # Workaround for CPACK directory symlink handling error.
mkdir -p @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/include @@ -58,7 +62,7 @@ case "$1" in abort-upgrade|abort-remove|abort-deconfigure) echo "$1" ;; - *) + ( * ) exit 0 ;; esac diff --git a/src/DEBIAN/pre_remove b/src/DEBIAN/prerm.in similarity index 91% rename from src/DEBIAN/pre_remove rename to src/DEBIAN/prerm.in index c168ebc4f..aa212593b 100644 --- a/src/DEBIAN/pre_remove +++ b/src/DEBIAN/prerm.in @@ -44,20 +44,23 @@ set -e +# @ENABLE_LDCONFIG@ is substituted with ON or OFF when the package is built rm_ldconfig() { - rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig + if [ "@ENABLE_LDCONFIG@" = "ON" ]; then + rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig + fi } case "$1" in - remove) + ( remove ) rm_ldconfig # Workaround for CPACK directory symlink handling error. rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/hsa ;; - purge) + ( purge ) ;; - *) + ( * ) exit 0 ;; esac - diff --git a/src/RPM/rpm_post b/src/RPM/post.in similarity index 90% rename from src/RPM/rpm_post rename to src/RPM/post.in index 7fcbaabda..117148233 100644 --- a/src/RPM/rpm_post +++ b/src/RPM/post.in @@ -40,4 +40,8 @@ ## ################################################################################ -echo /opt/rocm/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig +# @ENABLE_LDCONFIG@ is substituted with ON or OFF when the package is built +if [ "@ENABLE_LDCONFIG@" = "ON" ]; then + echo @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig +fi diff --git a/src/RPM/rpm_postun b/src/RPM/postun.in similarity index 91% rename from src/RPM/rpm_postun rename to src/RPM/postun.in index 494844747..801525d68 100644 --- a/src/RPM/rpm_postun +++ b/src/RPM/postun.in @@ -40,6 +40,8 @@ ## ################################################################################ -if [ $1 -eq 0 ]; then - rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig +# @ENABLE_LDCONFIG@ is substituted with ON or OFF when the package is built +if [ $1 -eq 0 ] && [ "@ENABLE_LDCONFIG@" = "ON" ]; then + rm -f /etc/ld.so.conf.d/hsa-rocr-dev.conf + ldconfig fi diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index e48f24c54..0ff7b5f8b 100644 --- a/src/core/common/hsa_table_interface.cpp +++ b/src/core/common/hsa_table_interface.cpp @@ -1151,6 +1151,27 @@ hsa_status_t HSA_API hsa_amd_signal_value_pointer(hsa_signal_t signal, return amdExtTable->hsa_amd_signal_value_pointer_fn(signal, value_ptr); } +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + return amdExtTable->hsa_amd_svm_attributes_set_fn(ptr, size, attribute_list, attribute_count); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + return amdExtTable->hsa_amd_svm_attributes_get_fn(ptr, size, attribute_list, attribute_count); +} + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + return amdExtTable->hsa_amd_svm_prefetch_async_fn(ptr, size, agent, num_dep_signals, dep_signals, completion_signal); +} + // Tools only table interfaces.
namespace rocr { diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index c00005ff0..df1f4f2b9 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -173,7 +173,8 @@ class GpuAgent : public GpuAgentInt { // @param [in] node Node id. Each CPU in different socket will get distinct // id. // @param [in] node_props Node property. - GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props); + // @param [in] xnack_mode XNACK mode of device. + GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode); // @brief GPU agent destructor. ~GpuAgent(); diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h index 68cf52d74..0cf527baf 100644 --- a/src/core/inc/amd_gpu_shaders.h +++ b/src/core/inc/amd_gpu_shaders.h @@ -439,6 +439,32 @@ static const unsigned int kCodeTrapHandler9[] = { 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, }; +static const unsigned int kCodeTrapHandler90a[] = { + 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, + 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, + 0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, + 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, + 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, + 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, + 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, + 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, + 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, + 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, + 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, + 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, + 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, + 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, + 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, + 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, + 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, + 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, + 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, + 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, + 0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, +}; + static const unsigned int kCodeCopyAligned8[] = { 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, 0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050, diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h index e119a939e..cce01e658 100644 --- a/src/core/inc/amd_memory_region.h +++ b/src/core/inc/amd_memory_region.h @@ -95,7 +95,7 @@ class MemoryRegion : public core::MemoryRegion { /// @brief Unpin memory. 
static void MakeKfdMemoryUnresident(const void* ptr); - MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, + MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props); ~MemoryRegion(); diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h index 510e36960..9954b8fc7 100644 --- a/src/core/inc/hsa_ext_amd_impl.h +++ b/src/core/inc/hsa_ext_amd_impl.h @@ -246,6 +246,21 @@ hsa_status_t hsa_amd_deregister_deallocation_callback( hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, volatile hsa_signal_value_t** value_ptr); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + } // namespace amd } // namespace rocr diff --git a/src/core/inc/hsa_ven_amd_loader_impl.h b/src/core/inc/hsa_ven_amd_loader_impl.h index 51ba4289c..82167253a 100644 --- a/src/core/inc/hsa_ven_amd_loader_impl.h +++ b/src/core/inc/hsa_ven_amd_loader_impl.h @@ -78,6 +78,12 @@ namespace rocr { size_t size, hsa_code_object_reader_t *code_object_reader); + hsa_status_t + hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); } // namespace rocr #endif diff --git a/src/core/inc/memory_region.h b/src/core/inc/memory_region.h index 583b13a12..4f362fad2 100644 --- a/src/core/inc/memory_region.h +++ b/src/core/inc/memory_region.h @@ -57,8 +57,8 @@ class Agent; class MemoryRegion : public Checked<0x9C961F19EE175BB3> { public: - MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner) - : fine_grain_(fine_grain), full_profile_(full_profile), owner_(owner) { + MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner) + : fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), owner_(owner) { assert(owner_ != NULL); } @@ -112,12 +112,15 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { __forceinline bool fine_grain() const { return fine_grain_; } + __forceinline bool kernarg() const { return kernarg_; } + __forceinline bool full_profile() const { return full_profile_; } __forceinline core::Agent* owner() const { return owner_; } private: const bool fine_grain_; + const bool kernarg_; const bool full_profile_; core::Agent* owner_; diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h index 634224c5f..d232e6f95 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -292,6 +292,15 @@ class Runtime { hsa_status_t IPCDetach(void* ptr); + hsa_status_t SetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + + hsa_status_t GetSvmAttrib(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + + hsa_status_t SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, hsa_signal_t completion_signal); + const std::vector<Agent*>& cpu_agents() { return cpu_agents_; } const std::vector<Agent*>& gpu_agents() { return gpu_agents_; }
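For reference, the Runtime::SetSvmAttrib/GetSvmAttrib/SvmPrefetch methods declared above back the public hsa_amd_svm_attributes_set/hsa_amd_svm_attributes_get/hsa_amd_svm_prefetch_async entry points added in hsa_ext_amd_impl.h. A minimal caller-side sketch, assuming an initialized runtime and a valid GPU agent (the helper name and error handling are illustrative, not part of this patch):

#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// Hypothetical helper: mark [ptr, ptr+size) coarse-grained and accessible to `gpu`.
static hsa_status_t set_svm_hints(void* ptr, size_t size, hsa_agent_t gpu) {
  hsa_amd_svm_attribute_pair_t attribs[] = {
      {HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED},
      {HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, gpu.handle},
  };
  return hsa_amd_svm_attributes_set(ptr, size, attribs,
                                    sizeof(attribs) / sizeof(attribs[0]));
}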
@@ -395,6 +404,28 @@ class Runtime { std::vector<void*> arg_; }; + struct PrefetchRange; + typedef std::map<uintptr_t, PrefetchRange> prefetch_map_t; + + struct PrefetchOp { + void* base; + size_t size; + uint32_t node_id; + int remaining_deps; + hsa_signal_t completion; + std::vector<hsa_signal_t> dep_signals; + prefetch_map_t::iterator prefetch_map_entry; + }; + + struct PrefetchRange { + PrefetchRange() {} + PrefetchRange(size_t Bytes, PrefetchOp* Op) : bytes(Bytes), op(Op) {} + size_t bytes; + PrefetchOp* op; + prefetch_map_t::iterator prev; + prefetch_map_t::iterator next; + }; + // Will be created before any user could call hsa_init but also could be // destroyed before incorrectly written programs call hsa_shutdown. static KernelMutex bootstrap_lock_; @@ -444,6 +475,9 @@ class Runtime { /// @retval Index in ::link_matrix_. uint32_t GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to); + /// @brief Get most recently issued SVM prefetch agent for the range in question. + Agent* GetSVMPrefetchAgent(void* ptr, size_t size); + // Mutex object to protect multithreaded access to ::allocation_map_, // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo // registered & mapped arrays. @@ -485,6 +519,10 @@ class Runtime { // Contains the region, address, and size of previously allocated memory. std::map<const void*, AllocationRegion> allocation_map_; + // Pending prefetch containers. + KernelMutex prefetch_lock_; + prefetch_map_t prefetch_map_; + // Allocator using ::system_region_ std::function<void*(size_t, size_t, MemoryRegion::AllocateFlags)> system_allocator_; diff --git a/src/core/inc/scratch_cache.h b/src/core/inc/scratch_cache.h index e85e22660..1e079babb 100644 --- a/src/core/inc/scratch_cache.h +++ b/src/core/inc/scratch_cache.h @@ -107,7 +107,7 @@ class ScratchCache { ScratchCache& operator=(const ScratchCache& rhs) = delete; ScratchCache& operator=(ScratchCache&& rhs) = delete; - ScratchCache(deallocator_t deallocator) : dealloc(deallocator) {} + ScratchCache(deallocator_t deallocator) : dealloc(deallocator), available_bytes(0) {} ~ScratchCache() { assert(map.empty() && "ScratchCache not empty at shutdown."); } @@ -122,6 +122,7 @@ class ScratchCache { it->second.alloc(); info.queue_base = it->second.base; info.scratch_node = it; + available_bytes -= it->first; return true; } it++; @@ -136,6 +137,7 @@ class ScratchCache { info.queue_base = it->second.base; info.size = it->first; info.scratch_node = it; + available_bytes -= it->first; return true; } it++; @@ -152,6 +154,8 @@ class ScratchCache { return; } it->second.free(); + available_bytes += it->first; + assert(it->first == info.size && "Scratch cache size mismatch."); } bool trim(bool trim_nodes_in_use) { @@ -159,6 +163,7 @@ class ScratchCache { auto it = map.begin(); while (it != map.end()) { if (it->second.isFree()) { + available_bytes -= it->first; dealloc(it->second.base, it->first, it->second.large); auto temp = it; it++; @@ -181,9 +186,14 @@ class ScratchCache { info.scratch_node = it; } + size_t free_bytes() const { + return available_bytes; + } + private: map_t map; deallocator_t dealloc; + size_t available_bytes; }; } // namespace AMD diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp index 576f66369..ddbfc1a9d 100644 --- a/src/core/runtime/amd_cpu_agent.cpp +++ b/src/core/runtime/amd_cpu_agent.cpp @@ -69,40 +69,31 @@ void CpuAgent::InitRegionList() { std::vector<HsaMemoryProperties> mem_props(properties_.NumMemoryBanks); if (HSAKMT_STATUS_SUCCESS == - hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, - &mem_props[0])) { + hsaKmtGetNodeMemoryProperties(node_id(),
properties_.NumMemoryBanks, &mem_props[0])) { std::vector<HsaMemoryProperties>::iterator system_prop = std::find_if(mem_props.begin(), mem_props.end(), [](HsaMemoryProperties prop) -> bool { return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM); }); - if (system_prop != mem_props.end()) { - MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, *system_prop); + HsaMemoryProperties system_props; + std::memset(&system_props, 0, sizeof(HsaMemoryProperties)); + system_props.HeapType = HSA_HEAPTYPE_SYSTEM; + system_props.SizeInBytes = 0; + system_props.VirtualBaseAddress = 0; - regions_.push_back(system_region_fine); + if (system_prop != mem_props.end()) system_props = *system_prop; - if (!is_apu_node) { - MemoryRegion* system_region_coarse = - new MemoryRegion(false, is_apu_node, this, *system_prop); + MemoryRegion* system_region_fine = + new MemoryRegion(true, false, is_apu_node, this, system_props); + regions_.push_back(system_region_fine); + MemoryRegion* system_region_kernarg = + new MemoryRegion(true, true, is_apu_node, this, system_props); + regions_.push_back(system_region_kernarg); - regions_.push_back(system_region_coarse); - } - } else { - HsaMemoryProperties system_props; - std::memset(&system_props, 0, sizeof(HsaMemoryProperties)); - - system_props.HeapType = HSA_HEAPTYPE_SYSTEM; - system_props.SizeInBytes = 0; - system_props.VirtualBaseAddress = 0; - - MemoryRegion* system_region_fine = new MemoryRegion(true, is_apu_node, this, system_props); - regions_.push_back(system_region_fine); - - if (!is_apu_node) { - MemoryRegion* system_region_coarse = - new MemoryRegion(false, is_apu_node, this, system_props); - regions_.push_back(system_region_coarse); - } + if (!is_apu_node) { + MemoryRegion* system_region_coarse = + new MemoryRegion(false, false, is_apu_node, this, system_props); + regions_.push_back(system_region_coarse); } } } diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 06265bda2..eb1e010cb 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -77,7 +77,7 @@ extern HsaApiTable hsa_internal_api_table_; } // namespace core namespace AMD { -GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) +GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode) : GpuAgentInt(node), properties_(node_props), current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), @@ -112,16 +112,15 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported; if (isa_base->IsSrameccSupported()) { - sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 - ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; } rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported; if (isa_base->IsXnackSupported()) { // TODO: This needs to be obtained from KFD once HMM is implemented. - xnack = profile_ == HSA_PROFILE_FULL ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + xnack = xnack_mode ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; } // Set instruction set architecture via node property, only on GPU device.
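With the region split above, the CPU agent now publishes a dedicated kernarg region instead of folding kernarg into the fine-grained system region, and region queries report HSA_REGION_GLOBAL_FLAG_KERNARG from the new kernarg() bit. A minimal sketch of how a client would locate that region through the standard HSA queries (the callback name is illustrative; assumes an initialized runtime and a CPU agent handle):

static hsa_status_t find_kernarg_region(hsa_region_t region, void* data) {
  uint32_t flags = 0;
  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
    *reinterpret_cast<hsa_region_t*>(data) = region;  // remember the match
    return HSA_STATUS_INFO_BREAK;                     // stop iterating early
  }
  return HSA_STATUS_SUCCESS;
}

// Usage: hsa_region_t kernarg = {0};
//        hsa_agent_iterate_regions(cpu_agent, find_kernarg_region, &kernarg);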
@@ -202,6 +201,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_7; ASICShader compute_8; ASICShader compute_9; + ASICShader compute_90a; ASICShader compute_1010; ASICShader compute_10; }; @@ -212,6 +212,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {NULL, 0, 0, 0}, {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, }}, @@ -220,6 +221,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, }}, @@ -228,6 +230,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, }}, @@ -236,6 +239,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar {kCodeFill7, sizeof(kCodeFill7), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill8, sizeof(kCodeFill8), 19, 8}, + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, {kCodeFill10, sizeof(kCodeFill10), 19, 8}, {kCodeFill10, sizeof(kCodeFill10), 19, 8}, }}}; @@ -254,6 +258,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar asic_shader = &compiled_shader_it->second.compute_8; break; case 9: + if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) + asic_shader = &compiled_shader_it->second.compute_90a; + else asic_shader = &compiled_shader_it->second.compute_9; break; case 10: @@ -302,6 +309,14 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2); AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1); + + if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) && + (isa_->GetStepping() == 10)) { + // Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a. + // FIXME: Assemble code objects from source at build time + int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1; + header->max_scratch_backing_memory_byte_size = uint64_t(gran_accvgprs) << 32; + } } // Copy shader code into the GPU-visible buffer. @@ -338,8 +353,7 @@ void GpuAgent::InitRegionList() { memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax; case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { - MemoryRegion* region = - new MemoryRegion(false, false, this, mem_props[mem_idx]); + MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]); regions_.push_back(region); @@ -348,7 +362,7 @@ void GpuAgent::InitRegionList() { // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. 
if ((properties_.HiveID != 0) || (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) { - regions_.push_back(new MemoryRegion(true, false, this, mem_props[mem_idx])); + regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx])); } } break; @@ -816,12 +830,16 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AGENT_INFO_DEVICE: *((hsa_device_type_t*)value) = HSA_DEVICE_TYPE_GPU; break; - case HSA_AGENT_INFO_CACHE_SIZE: + case HSA_AGENT_INFO_CACHE_SIZE: { std::memset(value, 0, sizeof(uint32_t) * 4); - // TODO: no GPU cache info from KFD. Hardcode for now. - // GCN whitepaper: L1 data cache is 16KB. - ((uint32_t*)value)[0] = 16 * 1024; - break; + assert(cache_props_.size() > 0 && "GPU cache info missing."); + const size_t num_cache = cache_props_.size(); + for (size_t i = 0; i < num_cache; ++i) { + const uint32_t line_level = cache_props_[i].CacheLevel; + if (reinterpret_cast<uint32_t*>(value)[line_level - 1] == 0) + reinterpret_cast<uint32_t*>(value)[line_level - 1] = cache_props_[i].CacheSize * 1024; + } + } break; case HSA_AGENT_INFO_ISA: *((hsa_isa_t*)value) = core::Isa::Handle(isa_); break; @@ -1089,12 +1107,13 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { Limit total bound small scratch allocations to 1/8th of scratch pool and 1/4 of that for a single allocation. */ + ScopedAcquire<KernelMutex> lock(&scratch_lock_); size_t small_limit = scratch_pool_.size() >> 3; // Lift limit for 2.10 release RCCL workaround. size_t single_limit = 146800640; //small_limit >> 2; bool use_reclaim = true; bool large = (scratch.size > single_limit) || - (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); + (scratch_pool_.size() - scratch_pool_.remaining() - scratch_cache_.free_bytes() + scratch.size > small_limit); if ((isa_->GetMajorVersion() < 8) || core::Runtime::runtime_singleton_->flag().no_scratch_reclaim()) { large = false; } @@ -1122,8 +1141,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { // Lambda called in place. // Used to allow exit from nested loops. [&]() { - ScopedAcquire<KernelMutex> lock(&scratch_lock_); // Check scratch cache + scratch.large = large; if (scratch_cache_.alloc(scratch)) return; // Attempt new allocation. diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index fca439866..4419286fd 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -100,9 +100,9 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr)); } -MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, +MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props) - : core::MemoryRegion(fine_grain, full_profile, owner), + : core::MemoryRegion(fine_grain, kernarg, full_profile, owner), mem_props_(mem_props), max_single_alloc_size_(0), virtual_size_(0), @@ -128,6 +128,8 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; + if (kernarg) mem_flag_.ui32.Uncached = 1; + virtual_size_ = (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } @@ -290,16 +292,14 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_GLOBAL_FLAGS: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: - *((uint32_t*)value) = fine_grain() - ? 
(HSA_REGION_GLOBAL_FLAG_KERNARG | - HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; - break; - case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: - *((uint32_t*)value) = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: { + uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED + : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG; + *((uint32_t*)value) = ret; break; + } default: *((uint32_t*)value) = 0; break; diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp index 185838489..71bc73821 100644 --- a/src/core/runtime/amd_topology.cpp +++ b/src/core/runtime/amd_topology.cpp @@ -68,6 +68,38 @@ namespace AMD { static const uint kKfdVersionMajor = 0; static const uint kKfdVersionMinor = 99; +// Query the user's preference and use it to determine the Xnack mode of the ROCm system. +// Returns true if Xnack mode is ON, false if OFF. The Xnack mode of a system is +// orthogonal to devices that do not support Xnack mode. It is legal for a +// system with Xnack ON to have devices that do not support Xnack functionality. +bool BindXnackMode() { + // Get the user's preference for the Xnack mode of the ROCm platform. + HSAint32 mode; + mode = core::Runtime::runtime_singleton_->flag().xnack(); + bool config_xnack = + (core::Runtime::runtime_singleton_->flag().xnack() != Flag::XNACK_REQUEST::XNACK_UNCHANGED); + + // Indicate the user's preference for Xnack mode to the driver. + // The call may fail; failure is a supported outcome. + HSAKMT_STATUS status = HSAKMT_STATUS_ERROR; + if (config_xnack) { + status = hsaKmtSetXNACKMode(mode); + if (status == HSAKMT_STATUS_SUCCESS) { + return mode; + } + } + + // Get the Xnack mode of devices bound by the driver. This path is taken + // when the call to set the Xnack mode fails or the user has no particular + // preference. + status = hsaKmtGetXNACKMode((HSAint32*)&mode); + if (status != HSAKMT_STATUS_SUCCESS) { + debug_print("KFD does not support xnack mode query.\nROCr must assume xnack is disabled.\n"); + return false; + } + return mode; +} + CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { if (node_prop.NumCPUCores == 0) { return nullptr; } @@ -79,14 +111,14 @@ CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { return cpu; } -GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { +GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop, bool xnack_mode) { GpuAgent* gpu = nullptr; if (node_prop.NumFComputeCores == 0) { // Ignore non-GPUs.
return nullptr; } try { - gpu = new GpuAgent(node_id, node_prop); + gpu = new GpuAgent(node_id, node_prop, xnack_mode); const HsaVersionInfo& kfd_version = core::Runtime::runtime_singleton_->KfdVersion(); @@ -111,7 +143,7 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { if (gpu->isa()->GetProcessorName() == "gfx908") { node_prop.Capability.ui32.SRAM_EDCSupport = 1; delete gpu; - gpu = new GpuAgent(node_id, node_prop); + gpu = new GpuAgent(node_id, node_prop, xnack_mode); } } } catch (const hsa_exception& e) { @@ -174,20 +206,28 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) { link_info.atomic_support_32bit = true; link_info.atomic_support_64bit = true; link_info.coherent_support = true; + if (core::Runtime::runtime_singleton_->flag().patch_xgmi_link_weight()) { + if (io_link.Weight == 0) { + io_link.Weight = 15; + } + } break; default: debug_print("Unrecognized IOLINK type.\n"); break; } - if (io_link.Flags.ui32.Override == 1) { - if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) { - // Ignore this link since peer to peer is not allowed. - continue; + // KFD is reporting wrong override status for XGMI. Disallow override for bringup. + if (!core::Runtime::runtime_singleton_->flag().patch_link_override()) { + if (io_link.Flags.ui32.Override == 1) { + if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) { + // Ignore this link since peer to peer is not allowed. + continue; + } + link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0); + link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0); + link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0); } - link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0); - link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0); - link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0); } link_info.max_bandwidth = io_link.MaximumBandwidth; @@ -204,7 +244,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) { /** * Process the list of Gpus that are surfaced to user */ -static void SurfaceGpuList(std::vector<uint32_t>& gpu_list) { +static void SurfaceGpuList(std::vector<uint32_t>& gpu_list, bool xnack_mode) { // Process user visible Gpu devices int32_t invalidIdx = -1; int32_t list_sz = gpu_list.size(); @@ -221,7 +261,7 @@ static void SurfaceGpuList(std::vector<uint32_t>& gpu_list) { // Instantiate a Gpu device.
The IO links // of this node have already been registered assert((node_prop.NumFComputeCores != 0) && "Improper node used for GPU device discovery."); - DiscoverGpu(gpu_list[idx], node_prop); + DiscoverGpu(gpu_list[idx], node_prop, xnack_mode); } } @@ -305,8 +345,11 @@ void BuildTopology() { RegisterLinkInfo(node_id, node_prop.NumIOLinks); } + // Determine the Xnack mode to be bound for system + bool xnack_mode = BindXnackMode(); + // Instantiate ROCr objects to encapsulate Gpu devices - SurfaceGpuList(gpu_usr_list); + SurfaceGpuList(gpu_usr_list, xnack_mode); } bool Load() { diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index e9f72adb3..f496f2ca9 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -341,6 +341,8 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin {"hsa_ext_finalizer_1_00_pfn_t", sizeof(hsa_ext_finalizer_1_00_pfn_t)}, {"hsa_ven_amd_loader_1_00_pfn_t", sizeof(hsa_ven_amd_loader_1_00_pfn_t)}, {"hsa_ven_amd_loader_1_01_pfn_t", sizeof(hsa_ven_amd_loader_1_01_pfn_t)}, + {"hsa_ven_amd_loader_1_02_pfn_t", sizeof(hsa_ven_amd_loader_1_02_pfn_t)}, + {"hsa_ven_amd_loader_1_03_pfn_t", sizeof(hsa_ven_amd_loader_1_03_pfn_t)}, {"hsa_ven_amd_aqlprofile_1_00_pfn_t", sizeof(hsa_ven_amd_aqlprofile_1_00_pfn_t)}}; static const size_t num_tables = sizeof(sizes) / sizeof(sizes_t); @@ -448,15 +450,21 @@ hsa_status_t hsa_system_get_major_extension_table(uint16_t extension, uint16_t v if (extension == HSA_EXTENSION_AMD_LOADER) { if (version_major != 1) return HSA_STATUS_ERROR; - hsa_ven_amd_loader_1_01_pfn_t ext_table; - ext_table.hsa_ven_amd_loader_query_host_address = hsa_ven_amd_loader_query_host_address; + hsa_ven_amd_loader_1_03_pfn_t ext_table; + ext_table.hsa_ven_amd_loader_query_host_address = + hsa_ven_amd_loader_query_host_address; ext_table.hsa_ven_amd_loader_query_segment_descriptors = hsa_ven_amd_loader_query_segment_descriptors; - ext_table.hsa_ven_amd_loader_query_executable = hsa_ven_amd_loader_query_executable; + ext_table.hsa_ven_amd_loader_query_executable = + hsa_ven_amd_loader_query_executable; ext_table.hsa_ven_amd_loader_executable_iterate_loaded_code_objects = hsa_ven_amd_loader_executable_iterate_loaded_code_objects; ext_table.hsa_ven_amd_loader_loaded_code_object_get_info = hsa_ven_amd_loader_loaded_code_object_get_info; + ext_table.hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size = + hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size; + ext_table.hsa_ven_amd_loader_iterate_executables = + hsa_ven_amd_loader_iterate_executables; memcpy(table, &ext_table, Min(sizeof(ext_table), table_length)); @@ -1547,7 +1555,7 @@ hsa_status_t hsa_agent_iterate_isas( const Isa *isa_object = agent_object->isa(); if (!isa_object) { - return HSA_STATUS_ERROR_INVALID_AGENT; + return HSA_STATUS_SUCCESS; } return callback(Isa::Handle(isa_object), data); @@ -1894,9 +1902,13 @@ static std::string ConvertOldTargetNameToNew( NewName = "amdgcn-amd-amdhsa--gfx906"; xnack_supported = true; } + else if (OldName == "AMD:AMDGPU:9:0:12") { + NewName = "amdgcn-amd-amdhsa--gfx90c"; + xnack_supported = true; + } else { - // Code object v2 only supports asics up to gfx906. Do NOT add handling - // of new asics into this if-else-if* block. + // Code object v2 only supports asics up to gfx906 plus gfx90c. Do NOT add + // handling of new asics into this if-else-if* block. 
return ""; } diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp index 9e50971b3..605ec15ae 100644 --- a/src/core/runtime/hsa_api_trace.cpp +++ b/src/core/runtime/hsa_api_trace.cpp @@ -391,6 +391,9 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_register_deallocation_callback_fn = AMD::hsa_amd_register_deallocation_callback; amd_ext_api.hsa_amd_deregister_deallocation_callback_fn = AMD::hsa_amd_deregister_deallocation_callback; amd_ext_api.hsa_amd_signal_value_pointer_fn = AMD::hsa_amd_signal_value_pointer; + amd_ext_api.hsa_amd_svm_attributes_set_fn = AMD::hsa_amd_svm_attributes_set; + amd_ext_api.hsa_amd_svm_attributes_get_fn = AMD::hsa_amd_svm_attributes_get; + amd_ext_api.hsa_amd_svm_prefetch_async_fn = AMD::hsa_amd_svm_prefetch_async; } void LoadInitialHsaApiTable() { diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index f4776eb99..66d7ff727 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -1009,5 +1009,37 @@ hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifie CATCH; } +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + TRY; + IS_OPEN(); + return core::Runtime::runtime_singleton_->SetSvmAttrib(ptr, size, attribute_list, + attribute_count); + CATCH; +} + +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + TRY; + IS_OPEN(); + return core::Runtime::runtime_singleton_->GetSvmAttrib(ptr, size, attribute_list, + attribute_count); + CATCH; +} + +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + TRY; + IS_OPEN(); + // Validate inputs. + // if (core::g_use_interrupt_wait && (!core::InterruptSignal::IsType(signal))) + return core::Runtime::runtime_singleton_->SvmPrefetch(ptr, size, agent, num_dep_signals, + dep_signals, completion_signal); + CATCH; +} + } // namespace amd } // namespace rocr diff --git a/src/core/runtime/hsa_ven_amd_loader.cpp b/src/core/runtime/hsa_ven_amd_loader.cpp index a74181d3b..c94563715 100644 --- a/src/core/runtime/hsa_ven_amd_loader.cpp +++ b/src/core/runtime/hsa_ven_amd_loader.cpp @@ -53,6 +53,7 @@ using namespace core; using loader::CodeObjectReaderImpl; using loader::Executable; using loader::LoadedCodeObject; +using loader::Loader; namespace AMD { @@ -273,4 +274,30 @@ hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size( } catch(...) { return AMD::handleException(); } } +namespace { + +Loader *GetLoader() { + return Runtime::runtime_singleton_->loader(); +} + +} // namespace anonymous + +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) { + try { + if (!Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == callback) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + return GetLoader()->IterateExecutables(callback, data); + } catch(...) 
{ return AMD::handleException(); } +} + } // namespace rocr diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index 8b889bb5d..b49dd65d9 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -281,6 +281,21 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx908:sramecc-:xnack+", 9, 0, 8, disabled, enabled) ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack-", 9, 0, 8, enabled, disabled) ISAREG_ENTRY_GEN("gfx908:sramecc+:xnack+", 9, 0, 8, enabled, enabled) + ISAREG_ENTRY_GEN("gfx909", 9, 0, 9, unsupported, any) + ISAREG_ENTRY_GEN("gfx909:xnack-", 9, 0, 9, unsupported, disabled) + ISAREG_ENTRY_GEN("gfx909:xnack+", 9, 0, 9, unsupported, enabled) + ISAREG_ENTRY_GEN("gfx90a", 9, 0, 10, any, any) + ISAREG_ENTRY_GEN("gfx90a:xnack-", 9, 0, 10, any, disabled) + ISAREG_ENTRY_GEN("gfx90a:xnack+", 9, 0, 10, any, enabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc-", 9, 0, 10, disabled, any) + ISAREG_ENTRY_GEN("gfx90a:sramecc+", 9, 0, 10, enabled, any) + ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack-", 9, 0, 10, disabled, disabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc-:xnack+", 9, 0, 10, disabled, enabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack-", 9, 0, 10, enabled, disabled) + ISAREG_ENTRY_GEN("gfx90a:sramecc+:xnack+", 9, 0, 10, enabled, enabled) + ISAREG_ENTRY_GEN("gfx90c", 9, 0, 12, unsupported, any) + ISAREG_ENTRY_GEN("gfx90c:xnack-", 9, 0, 12, unsupported, disabled) + ISAREG_ENTRY_GEN("gfx90c:xnack+", 9, 0, 12, unsupported, enabled) ISAREG_ENTRY_GEN("gfx1010", 10, 1, 0, unsupported, any) ISAREG_ENTRY_GEN("gfx1010:xnack-", 10, 1, 0, unsupported, disabled) ISAREG_ENTRY_GEN("gfx1010:xnack+", 10, 1, 0, unsupported, enabled) diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index c3cce57a1..93ad27186 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -60,6 +60,7 @@ #include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/hsa_api_trace_int.h" #include "core/util/os.h" +#include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" #define HSA_VERSION_MAJOR 1 @@ -173,16 +174,29 @@ void Runtime::RegisterAgent(Agent* agent) { if (cpu_agents_.size() == 1) { // Might need memory pooling to cover allocation that // requires less than 4096 bytes. - system_allocator_ = [this](size_t size, size_t align, MemoryRegion::AllocateFlags alloc_flags) -> void* { - assert(align <= 4096); - void* ptr = nullptr; - core::Runtime::runtime_singleton_->AllocateMemory(system_regions_fine_[0], size, alloc_flags, &ptr); - return ptr; - }; - - system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; - BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); + // Default system pool must support kernarg + for (auto pool : system_regions_fine_) { + if (pool->kernarg()) { + system_allocator_ = [pool](size_t size, size_t alignment, + MemoryRegion::AllocateFlags alloc_flags) -> void* { + assert(alignment <= 4096); + void* ptr = NULL; + return (HSA_STATUS_SUCCESS == + core::Runtime::runtime_singleton_->AllocateMemory(pool, size, alloc_flags, + &ptr)) + ? 
ptr + : NULL; + }; + + system_deallocator_ = [](void* ptr) { + core::Runtime::runtime_singleton_->FreeMemory(ptr); + }; + + BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); + break; + } + } } } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) { gpu_agents_.push_back(agent); @@ -630,6 +644,22 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { *(const char**)value = STRING(ROCR_BUILD_ID); break; } + case HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: { + bool ret = true; + for (auto agent : gpu_agents_) { + AMD::GpuAgent* gpu = (AMD::GpuAgent*)agent; + ret &= (gpu->properties().Capability.ui32.SVMAPISupported == 1); + } + *(bool*)value = ret; + break; + } + case HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: { + bool ret = true; + for (auto agent : gpu_agents_) + ret &= (agent->isa()->GetXnack() == IsaFeature::Enabled); + *(bool*)value = ret; + break; + } default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -1021,15 +1051,52 @@ void Runtime::AsyncEventsLoop(void*) { if (index == 0) { hsa_signal_handle(async_events_control_.wake)->StoreRelaxed(0); } else if (index != -1) { - // No error or timout occured, process the handler + // No error or timeout occurred, process the handlers + // Call handler for the known satisfied signal. assert(async_events_.handler_[index] != NULL); - bool keep = - async_events_.handler_[index](value, async_events_.arg_[index]); + bool keep = async_events_.handler_[index](value, async_events_.arg_[index]); if (!keep) { hsa_signal_handle(async_events_.signal_[index])->Release(); async_events_.CopyIndex(index, async_events_.Size() - 1); async_events_.PopBack(); } + // Check remaining signals before sleeping. + for (size_t i = index; i < async_events_.Size(); i++) { + hsa_signal_handle sig(async_events_.signal_[i]); + + value = atomic::Load(&sig->signal_.value, std::memory_order_relaxed); + bool condition_met = false; + + switch (async_events_.cond_[i]) { + case HSA_SIGNAL_CONDITION_EQ: { + condition_met = (value == async_events_.value_[i]); + break; + } + case HSA_SIGNAL_CONDITION_NE: { + condition_met = (value != async_events_.value_[i]); + break; + } + case HSA_SIGNAL_CONDITION_GTE: { + condition_met = (value >= async_events_.value_[i]); + break; + } + case HSA_SIGNAL_CONDITION_LT: { + condition_met = (value < async_events_.value_[i]); + break; + } + } + + if (condition_met) { + assert(async_events_.handler_[i] != NULL); + bool keep = async_events_.handler_[i](value, async_events_.arg_[i]); + if (!keep) { + hsa_signal_handle(async_events_.signal_[i])->Release(); + async_events_.CopyIndex(i, async_events_.Size() - 1); + async_events_.PopBack(); + i--; + } + } + } } // Check for dead signals @@ -1431,8 +1498,7 @@ void Runtime::LoadTools() { if (tool != NULL) { tool_libs_.push_back(tool); - tool_init_t ld; - ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad"); + rocr::AMD::callback_t<tool_init_t> ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad"); if (ld) { if (!ld(&hsa_api_table_.hsa_api, hsa_api_table_.hsa_api.version.major_id, @@ -1443,8 +1509,8 @@ void Runtime::LoadTools() { } } - tool_wrap_t wrap; - wrap = (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent"); + rocr::AMD::callback_t<tool_wrap_t> wrap = + (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent"); if (wrap) { std::vector<Agent*>* agent_lists[2] = {&cpu_agents_, &gpu_agents_}; @@ -1461,8 +1527,7 @@ void Runtime::LoadTools() { } } - tool_add_t add; - add = (tool_add_t)os::GetExportAddress(tool, "AddAgent"); + rocr::AMD::callback_t<tool_add_t> add = 
(tool_add_t)os::GetExportAddress(tool, "AddAgent"); if (add) add(this); } else { @@ -1572,5 +1637,499 @@ void Runtime::InternalQueueCreateNotify(const hsa_queue_t* queue, hsa_agent_t ag internal_queue_create_notifier_(queue, agent, internal_queue_create_notifier_user_data_); } +hsa_status_t Runtime::SetSvmAttrib(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + uint32_t set_attribs = 0; + std::vector<bool> agent_seen(agents_by_node_.size(), false); + + std::vector<HSA_SVM_ATTRIBUTE> attribs; + attribs.reserve(attribute_count); + uint32_t set_flags = 0; + uint32_t clear_flags = 0; + + auto Convert = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent == nullptr) || !agent->IsValid()) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::SetSvmAttrib."); + return agent; + }; + + auto ConvertAllowNull = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent != nullptr) && (!agent->IsValid())) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::SetSvmAttrib."); + return agent; + }; + + auto ConfirmNew = [&](Agent* agent) { + if (agent_seen[agent->node_id()]) + throw AMD::hsa_exception( + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, + "Multiple attributes given for the same agent in Runtime::SetSvmAttrib."); + agent_seen[agent->node_id()] = true; + }; + + auto Check = [&](uint64_t attrib) { + if (set_attribs & (1 << attrib)) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, + "Attribute given multiple times in Runtime::SetSvmAttrib."); + set_attribs |= (1 << attrib); + }; + + auto kmtPair = [](uint32_t attrib, uint32_t value) { + HSA_SVM_ATTRIBUTE pair = {attrib, value}; + return pair; + }; + + for (uint32_t i = 0; i < attribute_count; i++) { + auto attrib = attribute_list[i].attribute; + auto value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: { + Check(attrib); + switch (value) { + case HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED: + set_flags |= HSA_SVM_FLAG_COHERENT; + break; + case HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED: + clear_flags |= HSA_SVM_FLAG_COHERENT; + break; + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Invalid HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG value."); + } + break; + } + case HSA_AMD_SVM_ATTRIB_READ_ONLY: { + Check(attrib); + if (value) + set_flags |= HSA_SVM_FLAG_GPU_RO; + else + clear_flags |= HSA_SVM_FLAG_GPU_RO; + break; + } + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + Check(attrib); + if (value) + set_flags |= HSA_SVM_FLAG_HIVE_LOCAL; + else + clear_flags |= HSA_SVM_FLAG_HIVE_LOCAL; + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + Check(attrib); + // Max migration size is 1GB. 
+ if (value > 18) value = 18; + attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, value)); + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + Check(attrib); + Agent* agent = ConvertAllowNull(value); + if (agent == nullptr) + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, INVALID_NODEID)); + else + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, agent->node_id())); + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + set_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id())); + } + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + set_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS_IN_PLACE, agent->node_id())); + } + break; + } + case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: { + Agent* agent = Convert(value); + ConfirmNew(agent); + if (agent->device_type() == Agent::kAmdCpuDevice) { + clear_flags |= HSA_SVM_FLAG_HOST_ACCESS; + } else { + attribs.push_back(kmtPair(HSA_SVM_ATTR_NO_ACCESS, agent->node_id())); + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::SetSvmAttrib"); + } + } + + // Merge CPU access properties - grant access if any CPU needs access. + // Probably wrong. + if (set_flags & HSA_SVM_FLAG_HOST_ACCESS) clear_flags &= ~HSA_SVM_FLAG_HOST_ACCESS; + + // Add flag updates + if (clear_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_CLR_FLAGS, clear_flags)); + if (set_flags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, set_flags)); + + uint8_t* base = AlignDown((uint8_t*)ptr, 4096); + uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096); + size_t len = end - base; + HSAKMT_STATUS error = hsaKmtSVMSetAttr(base, len, attribs.size(), &attribs[0]); + if (error != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMSetAttr failed."); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::GetSvmAttrib(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count) { + std::vector<HSA_SVM_ATTRIBUTE> attribs; + attribs.reserve(attribute_count); + + std::vector<uint32_t> kmtIndices(attribute_count); + + bool getFlags = false; + + auto Convert = [&](uint64_t value) -> Agent* { + hsa_agent_t handle = {value}; + Agent* agent = Agent::Convert(handle); + if ((agent == nullptr) || !agent->IsValid()) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_AGENT, + "Invalid agent handle in Runtime::GetSvmAttrib."); + return agent; + }; + + auto kmtPair = [](uint32_t attrib, uint32_t value) { + HSA_SVM_ATTRIBUTE pair = {attrib, value}; + return pair; + }; + + for (uint32_t i = 0; i < attribute_count; i++) { + auto& attrib = attribute_list[i].attribute; + auto& value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: + case HSA_AMD_SVM_ATTRIB_READ_ONLY: + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + getFlags = true; + kmtIndices[i] = -1; + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_GRANULARITY, 0)); + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_PREFERRED_LOC, 0)); + break; + } + case 
HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: { + value = Agent::Convert(GetSVMPrefetchAgent(ptr, size)).handle; + kmtIndices[i] = -1; + break; + } + case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: { + Agent* agent = Convert(value); + if (agent->device_type() == Agent::kAmdCpuDevice) { + getFlags = true; + kmtIndices[i] = -1; + } else { + kmtIndices[i] = attribs.size(); + attribs.push_back(kmtPair(HSA_SVM_ATTR_ACCESS, agent->node_id())); + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::SetSvmAttrib"); + } + } + + if (getFlags) attribs.push_back(kmtPair(HSA_SVM_ATTR_SET_FLAGS, 0)); + + uint8_t* base = AlignDown((uint8_t*)ptr, 4096); + uint8_t* end = AlignUp((uint8_t*)ptr + size, 4096); + size_t len = end - base; + if (attribs.size() != 0) { + HSAKMT_STATUS error = hsaKmtSVMGetAttr(base, len, attribs.size(), &attribs[0]); + if (error != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "hsaKmtSVMGetAttr failed."); + } + + for (uint32_t i = 0; i < attribute_count; i++) { + auto& attrib = attribute_list[i].attribute; + auto& value = attribute_list[i].value; + + switch (attrib) { + case HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG: { + if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_COHERENT) + value = HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED; + else + value = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED; + break; + } + case HSA_AMD_SVM_ATTRIB_READ_ONLY: { + value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_GPU_RO); + break; + } + case HSA_AMD_SVM_ATTRIB_HIVE_LOCAL: { + value = (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HIVE_LOCAL); + break; + } + case HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY: { + value = attribs[kmtIndices[i]].value; + break; + } + case HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION: { + uint64_t node = attribs[kmtIndices[i]].value; + Agent* agent = nullptr; + if (node != INVALID_NODEID) agent = agents_by_node_[node][0]; + value = Agent::Convert(agent).handle; + break; + } + case HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION: { + break; + } + case HSA_AMD_SVM_ATTRIB_ACCESS_QUERY: { + if (kmtIndices[i] == -1) { + if (attribs[attribs.size() - 1].value & HSA_SVM_FLAG_HOST_ACCESS) + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE; + } else { + switch (attribs[kmtIndices[i]].type) { + case HSA_SVM_ATTR_ACCESS: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE; + break; + case HSA_SVM_ATTR_ACCESS_IN_PLACE: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE; + break; + case HSA_SVM_ATTR_NO_ACCESS: + attrib = HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS; + break; + default: + assert(false && "Bad agent accessibility from KFD."); + } + } + break; + } + default: + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Illegal or invalid attribute in Runtime::GetSvmAttrib"); + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::SvmPrefetch(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096)); + uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096); + size_t len = end - base; + + PrefetchOp* op = new PrefetchOp(); + MAKE_NAMED_SCOPE_GUARD(OpGuard, [&]() { delete op; }); + + Agent* dest = Agent::Convert(agent); + if (dest->device_type() == Agent::kAmdCpuDevice) + op->node_id = 0; + else + op->node_id = dest->node_id(); + + op->base = reinterpret_cast<void*>(base); + op->size = len; + op->completion = completion_signal; + if (num_dep_signals > 1) { + op->remaining_deps = 
num_dep_signals - 1; + for (int i = 0; i < num_dep_signals - 1; i++) op->dep_signals.push_back(dep_signals[i]); + } else { + op->remaining_deps = 0; + } + + { + ScopedAcquire<KernelMutex> lock(&prefetch_lock_); + // Remove all fully overlapped and trim partially overlapped ranges. + // Get iteration bounds + auto start = prefetch_map_.upper_bound(base); + if (start != prefetch_map_.begin()) start--; + auto stop = prefetch_map_.lower_bound(end); + + auto isEndNode = [&](decltype(start) node) { return node->second.next == prefetch_map_.end(); }; + auto isFirstNode = [&](decltype(start) node) { + return node->second.prev == prefetch_map_.end(); + }; + + // Trim and remove old ranges. + while (start != stop) { + uintptr_t startBase = start->first; + uintptr_t startEnd = startBase + start->second.bytes; + + auto ibase = Max(startBase, base); + auto iend = Min(startEnd, end); + // Check for overlap + if (ibase < iend) { + // Second range check + if (iend < startEnd) { + auto ret = prefetch_map_.insert( + std::make_pair(iend, PrefetchRange(startEnd - iend, start->second.op))); + assert(ret.second && "Prefetch map insert failed during range split."); + + auto it = ret.first; + it->second.prev = start; + it->second.next = start->second.next; + start->second.next = it; + if (!isEndNode(it)) it->second.next->second.prev = it; + } + + // Is the first interval of the old range valid + if (startBase < ibase) { + start->second.bytes = ibase - startBase; + } else { + if (isFirstNode(start)) { + start->second.op->prefetch_map_entry = start->second.next; + if (!isEndNode(start)) start->second.next->second.prev = prefetch_map_.end(); + } else { + start->second.prev->second.next = start->second.next; + if (!isEndNode(start)) start->second.next->second.prev = start->second.prev; + } + prefetch_map_.erase(start); + } + } + start++; + } + + // Insert new range. + auto ret = prefetch_map_.insert(std::make_pair(base, PrefetchRange(len, op))); + assert(ret.second && "Prefetch map insert failed."); + + auto it = ret.first; + op->prefetch_map_entry = it; + it->second.next = it->second.prev = prefetch_map_.end(); + } + + // Remove the prefetch's ranges from the map. + static auto removePrefetchRanges = [](PrefetchOp* op) { + ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_); + auto it = op->prefetch_map_entry; + while (it != Runtime::runtime_singleton_->prefetch_map_.end()) { + auto next = it->second.next; + Runtime::runtime_singleton_->prefetch_map_.erase(it); + it = next; + } + }; + + // Prefetch Signal handler for synchronization. 
+
+Agent* Runtime::GetSVMPrefetchAgent(void* ptr, size_t size) {
+  uintptr_t base = reinterpret_cast<uintptr_t>(AlignDown(ptr, 4096));
+  uintptr_t end = AlignUp(reinterpret_cast<uintptr_t>(ptr) + size, 4096);
+  size_t len = end - base;
+
+  std::vector<std::pair<uintptr_t, size_t>> holes;
+
+  ScopedAcquire<KernelMutex> lock(&Runtime::runtime_singleton_->prefetch_lock_);
+  auto start = prefetch_map_.upper_bound(base);
+  if (start != prefetch_map_.begin()) start--;
+  auto stop = prefetch_map_.lower_bound(end);
+
+  // KFD returns -1 for no or mixed destinations.
+  uint32_t prefetch_node = -2;
+  if (start != stop) {
+    prefetch_node = start->second.op->node_id;
+  }
+
+  while (start != stop) {
+    uintptr_t startBase = start->first;
+    uintptr_t startEnd = startBase + start->second.bytes;
+
+    auto ibase = Max(base, startBase);
+    auto iend = Min(end, startEnd);
+    // Check for intersection with the query
+    if (ibase < iend) {
+      // If prefetch locations are different then we report null agent.
+      if (prefetch_node != start->second.op->node_id) return nullptr;
+
+      // Push leading gap to an array for checking KFD.
+      if (base < ibase) holes.push_back(std::make_pair(base, ibase - base));
+
+      // Trim query range.
+      base = iend;
+    }
+    start++;
+  }
+  if (base < end) holes.push_back(std::make_pair(base, end - base));
+
+  HSA_SVM_ATTRIBUTE attrib;
+  attrib.type = HSA_SVM_ATTR_PREFETCH_LOC;
+  for (auto& range : holes) {
+    HSAKMT_STATUS error =
+        hsaKmtSVMGetAttr(reinterpret_cast<void*>(range.first), range.second, 1, &attrib);
+    assert(error == HSAKMT_STATUS_SUCCESS && "KFD prefetch query failed.");
+
+    if (attrib.value == -1) return nullptr;
+    if (prefetch_node == -2) prefetch_node = attrib.value;
+    if (prefetch_node != attrib.value) return nullptr;
+  }
+
+  assert(prefetch_node != -2 && "prefetch_node was not updated.");
+  assert(prefetch_node != -1 && "Should have already returned.");
+  return agents_by_node_[prefetch_node][0];
+}
+
 }  // namespace core
 }  // namespace rocr
diff --git a/src/core/util/flag.h b/src/core/util/flag.h
index 5013dd1a8..88fbe93be 100644
--- a/src/core/util/flag.h
+++ b/src/core/util/flag.h
@@ -56,6 +56,11 @@ class Flag {
  public:
   enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT };
 
+  // The values are meaningful and chosen to satisfy the thunk API.
+ enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 }; + static_assert(XNACK_DISABLE == 0, "XNACK_REQUEST enum values improperly changed."); + static_assert(XNACK_ENABLE == 1, "XNACK_REQUEST enum values improperly changed."); + explicit Flag() { Refresh(); } virtual ~Flag() {} @@ -104,6 +109,12 @@ class Flag { var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR"); disable_fragment_alloc_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_UNPATCH_XGMI_LINK_WEIGHT"); + patch_xgmi_link_weight_ = (var == "1") ? false : true; + + var = os::GetEnvVar("HSA_UNPATCH_LINK_OVERRIDE"); + patch_link_override_ = (var == "1") ? false : true; + var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH"); enable_sdma_hdp_flush_ = (var == "0") ? false : true; @@ -130,6 +141,11 @@ class Flag { var = os::GetEnvVar("HSA_IGNORE_SRAMECC_MISREPORT"); check_sramecc_validity_ = (var == "1") ? false : true; + + // Legal values are zero "0" or one "1". Any other value will + // be interpreted as not defining the env variable. + var = os::GetEnvVar("HSA_XNACK"); + xnack_ = (var == "0") ? XNACK_DISABLE : ((var == "1") ? XNACK_ENABLE : XNACK_UNCHANGED); } bool check_flat_scratch() const { return check_flat_scratch_; } @@ -150,6 +166,11 @@ class Flag { bool disable_fragment_alloc() const { return disable_fragment_alloc_; } + // Temporary way to control ROCr interpretation of inter-device link weight + bool patch_xgmi_link_weight() const { return patch_xgmi_link_weight_; } + + bool patch_link_override() const { return patch_link_override_; } + bool rev_copy_dir() const { return rev_copy_dir_; } bool fine_grain_pcie() const { return fine_grain_pcie_; } @@ -178,6 +199,8 @@ class Flag { bool check_sramecc_validity() const { return check_sramecc_validity_; } + XNACK_REQUEST xnack() const { return xnack_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -195,6 +218,8 @@ class Flag { bool disable_image_; bool loader_enable_mmap_uri_; bool check_sramecc_validity_; + bool patch_xgmi_link_weight_; + bool patch_link_override_; SDMA_OVERRIDE enable_sdma_; @@ -209,6 +234,9 @@ class Flag { size_t force_sdma_size_; + // Indicates user preference for Xnack state. + XNACK_REQUEST xnack_; + DISALLOW_COPY_AND_ASSIGN(Flag); }; diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp index e60036929..6c2761151 100644 --- a/src/core/util/lnx/os_linux.cpp +++ b/src/core/util/lnx/os_linux.cpp @@ -74,7 +74,7 @@ void* __stdcall ThreadTrampoline(void* arg) { void* Data = ar->entry_args; delete ar; CallMe(Data); - return NULL; + return nullptr; } // Thread container allows multiple waits and separate close (destroy). 
@@ -99,7 +99,16 @@ class os_thread { assert(err == 0 && "pthread_attr_setstacksize failed."); } - int err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + int cores = get_nprocs_conf(); + cpu_set_t* cpuset = CPU_ALLOC(cores); + for(int i=0; i> 8; } -uint32_t MinorVerFromDevID(uint32_t dev_id) { - return (dev_id % 100)/10; -} +uint32_t MinorVerFromDevID(uint32_t dev_id) { return (dev_id >> 4) & 0xF; } -uint32_t StepFromDevID(uint32_t dev_id) { - return (dev_id%100)%10; -} +uint32_t StepFromDevID(uint32_t dev_id) { return dev_id & 0xF; } hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) { char asic_name[64]; @@ -78,7 +72,10 @@ hsa_status_t GetGPUAsicID(hsa_agent_t agent, uint32_t *chip_id) { assert(a_str.compare(0, 3, "gfx", 3) == 0); a_str.erase(0,3); - *chip_id = std::stoi(a_str); + + // Load chip_id accounting for stepping and minor in hex and major in dec. + *chip_id = std::stoi(a_str.substr(a_str.length() - 2), nullptr, 16); + *chip_id += (std::stoi(a_str.substr(0, a_str.length() - 2)) << 8); return HSA_STATUS_SUCCESS; } @@ -163,6 +160,7 @@ uint32_t DevIDToAddrLibFamily(uint32_t dev_id) { case 4: // Vega12 case 6: // Vega20 case 8: // Arcturus + case 10: // Aldebaran return FAMILY_AI; case 2: diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index adcdec490..698286979 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -115,10 +115,15 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3D = 0x03d, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X3E = 0x03e, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X41 = 0x041, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. 
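The device-ID helpers in the hunk above repack the chip id so that the stepping and minor revision are single hex nibbles and the major version occupies the bits above them, which is also what the new GetGPUAsicID parsing produces from a gfx name (for example "gfx90a" becomes 0x90A). A standalone round-trip sketch (illustrative only; ChipIdFromGfxName is a hypothetical restatement of the parsing, not a runtime symbol):

#include <cassert>
#include <cstdint>
#include <string>

// Hypothetical restatement of GetGPUAsicID's parsing: minor and stepping are
// the last two hex digits, the decimal major version is shifted above them.
static uint32_t ChipIdFromGfxName(std::string name) {  // e.g. "gfx90a"
  name.erase(0, 3);                                    // drop "gfx" -> "90a"
  uint32_t id = std::stoi(name.substr(name.length() - 2), nullptr, 16);
  id += std::stoi(name.substr(0, name.length() - 2)) << 8;
  return id;
}

int main() {
  uint32_t id = ChipIdFromGfxName("gfx90a");
  assert(id == 0x90A);
  assert((id >> 8) == 9);          // major, as in MajorVerFromDevID
  assert(((id >> 4) & 0xF) == 0);  // minor, as in MinorVerFromDevID
  assert((id & 0xF) == 0xA);       // stepping, as in StepFromDevID
  return 0;
}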
diff --git a/src/inc/hsa.h b/src/inc/hsa.h index d8fdd472b..fe1facca4 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -480,7 +480,21 @@ typedef enum { /** * String containing the ROCr build identifier. */ - HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200 + HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202 } hsa_system_info_t; /** diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h index bf3e9197e..35dd21bfa 100644 --- a/src/inc/hsa_api_trace.h +++ b/src/inc/hsa_api_trace.h @@ -183,6 +183,9 @@ struct AmdExtTable { decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn; decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn; decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn; + decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn; + decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn; + decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn; }; // Table to export HSA Core Runtime Apis diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 9df7c49ae..a0bc5d1d9 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -2116,6 +2116,162 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, hsa_amd_deallocation_callback_t callback); +typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency model. + */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Physical location of most recent prefetch command. 
+  // If the prefetch location has not been set or is not uniform across the
+  // address range then returned hsa_agent_t::handle will be 0.
+  // Querying this attribute will return the destination agent of the most
+  // recent ::hsa_amd_svm_prefetch_async targeting the address range. If
+  // multiple async prefetches have been issued targeting the region and the
+  // most recently issued prefetch has completed then the query will return
+  // the location of the most recently completed prefetch.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range. Access may incur a page fault
+  // and associated memory migration. Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
+  // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range without page faults. Access
+  // will not incur a page fault and will not cause access-based migration.
+  // Either this or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to
+  // SVM access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Denies an agent access to the memory range. Access will cause a terminal
+  // segfault.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set.
+  // Returns the access attribute associated with the agent.
+  // The agent to query must be set in the attribute value field.
+  // The attribute enum will be replaced with the agent's current access
+  // attribute for the address range.
+  // TODO: Clarify KFD return value for non-uniform access attribute.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
+} hsa_amd_svm_attribute_t;
+
+// List type for hsa_amd_svm_attributes_set/get.
+typedef struct hsa_amd_svm_attribute_pair_s {
+  // hsa_amd_svm_attribute_t value.
+  uint64_t attribute;
+  // Attribute value. Bit values should be interpreted according to the type
+  // given in the associated attribute description.
+  uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
+/**
+ * @brief Sets SVM memory attributes.
+ *
+ * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
+ * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
+ * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
+ * memory access by that Agent.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
+ * may not be used with this API.
+ *
+ * @param[in] ptr Will be aligned down to the nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to the nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Gets SVM memory attributes.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
+ * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
+ * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API.
+ *
+ * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
+ * hsa_agent_t and returns the current access type through its attribute field.
+ *
+ * @param[in] ptr Will be aligned down to the nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to the nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to query for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Asynchronously migrates memory to an agent.
+ *
+ * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
+ * @p completion_signal will decrement when the migration is complete.
+ *
+ * @param[in] ptr Will be aligned down to the nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to the nearest page boundary.
+ *
+ * @param[in] agent Agent to migrate to.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the migration
+ * operation starts. The migration will start after every signal has been observed with
+ * the value 0. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the migration
+ * operation. When the migration operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the migration
+ * operation by setting the value of the completion signal to a negative
+ * number. If no completion signal is required this handle may be null.
+ */
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal);
+
 #ifdef __cplusplus
 }  // end extern "C" block
 #endif
diff --git a/src/inc/hsa_ven_amd_loader.h b/src/inc/hsa_ven_amd_loader.h
index 3ce847555..47236c86e 100644
--- a/src/inc/hsa_ven_amd_loader.h
+++ b/src/inc/hsa_ven_amd_loader.h
@@ -493,10 +493,44 @@ hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size(
 //===----------------------------------------------------------------------===//
 
+/**
+ * @brief Iterate over the available executables, and invoke an
+ * application-defined callback on every iteration. While
+ * ::hsa_ven_amd_loader_iterate_executables is executing, any calls to
+ * ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy will be blocked.
+ *
+ * @param[in] callback Callback to be invoked once per executable. The HSA
+ * runtime passes two arguments to the callback: the executable and the
+ * application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_ven_amd_loader_iterate_executables returns that status value. If
+ * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or
+ * ::hsa_executable_destroy then the behavior is undefined.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + +//===----------------------------------------------------------------------===// + /** * @brief Extension version. */ -#define hsa_ven_amd_loader 001002 +#define hsa_ven_amd_loader 001003 /** * @brief Extension function table version 1.00. @@ -582,6 +616,50 @@ typedef struct hsa_ven_amd_loader_1_02_pfn_s { hsa_code_object_reader_t *code_object_reader); } hsa_ven_amd_loader_1_02_pfn_t; +/** + * @brief Extension function table version 1.03. + */ +typedef struct hsa_ven_amd_loader_1_03_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + + hsa_status_t + (*hsa_ven_amd_loader_iterate_executables)( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); +} hsa_ven_amd_loader_1_03_pfn_t; + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index cb7964422..3b21b69c4 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -568,6 +568,7 @@ namespace code { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: name = "gfx906"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: name = "gfx908"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: name = "gfx909"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: name = "gfx90a"; xnack_supported = true; sramecc_supported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: name = "gfx90c"; xnack_supported = true; sramecc_supported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true; sramecc_supported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true; sramecc_supported = false; break; @@ -622,9 +623,11 @@ namespace code { mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; else if (old_name == "AMD:AMDGPU:9:0:6" || old_name == "AMD:AMDGPU:9:0:7") mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; + else if (old_name == "AMD:AMDGPU:9:0:12") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; else { - // Code object v2 only supports asics up to gfx906. Do NOT add handling - // of new asics into this if-else-if* block. 
+ // Code object v2 only supports asics up to gfx906 plus gfx90c. Do NOT + // add handling of new asics into this if-else-if* block. return ""; } std::string name; diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp deleted file mode 100644 index a36ce2c95..000000000 --- a/src/loader/loaders.cpp +++ /dev/null @@ -1,281 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#include -#include -#include "loaders.hpp" - -namespace rocr { -namespace amd { -namespace hsa { -namespace loader { - - // Helper function that allocates an aligned memory. - static inline void* - alignedMalloc(size_t size, size_t alignment) - { - #if defined(_WIN32) - return ::_aligned_malloc(size, alignment); - #else - void * ptr = NULL; - alignment = (std::max)(alignment, sizeof(void*)); - if (0 == ::posix_memalign(&ptr, alignment, size)) { - return ptr; - } - return NULL; - #endif - } - - // Helper function that frees an aligned memory. 
- static inline void - alignedFree(void *ptr) - { - #if defined(_WIN32) - ::_aligned_free(ptr); - #else - free(ptr); - #endif - } - - OfflineLoaderContext::OfflineLoaderContext() - : out(std::cout) - { - invalid.handle = 0; - gfx700.handle = 700; - gfx701.handle = 701; - gfx702.handle = 702; - gfx801.handle = 801; - gfx802.handle = 802; - gfx803.handle = 803; - gfx805.handle = 805; - gfx810.handle = 810; - gfx900.handle = 900; - gfx902.handle = 902; - gfx904.handle = 904; - gfx906.handle = 906; - gfx908.handle = 908; - gfx1010.handle = 1010; - gfx1011.handle = 1011; - gfx1012.handle = 1012; - gfx1030.handle = 1030; - gfx1031.handle = 1031; - gfx1032.handle = 1032; - gfx1033.handle = 1033; - } - - hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name) - { - std::string sname(name); - if (sname == "AMD:AMDGPU:7:0:0") { - return gfx700; - } else if (sname == "AMD:AMDGPU:7:0:1") { - return gfx701; - } else if (sname == "AMD:AMDGPU:7:0:2") { - return gfx702; - } else if (sname == "AMD:AMDGPU:7:0:3") { - return gfx703; - } else if (sname == "AMD:AMDGPU:7:0:4") { - return gfx704; - } else if (sname == "AMD:AMDGPU:7:0:5") { - return gfx705; - } else if (sname == "AMD:AMDGPU:8:0:1") { - return gfx801; - } else if (sname == "AMD:AMDGPU:8:0:0" || sname == "AMD:AMDGPU:8:0:2") { - return gfx802; - } else if (sname == "AMD:AMDGPU:8:0:3" || sname == "AMD:AMDGPU:8:0:4") { - return gfx803; - } else if (sname == "AMD:AMDGPU:8:0:5") { - return gfx805; - } else if (sname == "AMD:AMDGPU:8:1:0") { - return gfx810; - } else if (sname == "AMD:AMDGPU:9:0:0" || sname == "AMD:AMDGPU:9:0:1") { - return gfx900; - } else if (sname == "AMD:AMDGPU:9:0:2" || sname == "AMD:AMDGPU:9:0:3") { - return gfx902; - } else if (sname == "AMD:AMDGPU:9:0:4" || sname == "AMD:AMDGPU:9:0:5") { - return gfx904; - } else if (sname == "AMD:AMDGPU:9:0:6" || sname == "AMD:AMDGPU:9:0:7") { - return gfx906; - } else if (sname == "AMD:AMDGPU:9:0:8") { - return gfx908; - } else if (sname == "AMD:AMDGPU:10:1:0") { - return gfx1010; - } else if (sname == "AMD:AMDGPU:10:1:1") { - return gfx1011; - } else if (sname == "AMD:AMDGPU:10:1:2") { - return gfx1012; - } else if (sname == "AMD:AMDGPU:10:3:0") { - return gfx1030; - } else if (sname == "AMD:AMDGPU:10:3:1") { - return gfx1031; - } else if (sname == "AMD:AMDGPU:10:3:2") { - return gfx1032; - } else if (sname == "AMD:AMDGPU:10:3:3") { - return gfx1033; - } - - // The offline loader only supports code object v2 which only supports - // asics up to gfx906. Do NOT add handling of new asics into this - // if-else-if* block. 
- assert(0); - return invalid; - } - - bool OfflineLoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) - { - return true; - } - - void* OfflineLoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) - { - void* ptr = alignedMalloc(size, align); - if (zero) { memset(ptr, 0, size); } - out << "SegmentAlloc: " << segment << ": " << "size=" << size << " align=" << align << " zero=" << zero << " result=" << ptr << std::endl; - pointers.insert(ptr); - return ptr; - } - - bool OfflineLoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) - { - out << "SegmentCopy: " << segment << ": " << "dst=" << dst << " offset=" << offset << " src=" << src << " size=" << size << std::endl; - if (!dst || !src || dst == src) { - return false; - } - if (0 == size) { - return true; - } - memcpy((char *) dst + offset, src, size); - return true; - } - - void OfflineLoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) - { - out << "SegmentFree: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl; - pointers.erase(seg); - alignedFree(seg); - } - - void* OfflineLoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) - { - out << "SegmentAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl; - return (char*) seg + offset; - } - - void* OfflineLoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) - { - out << "SegmentHostAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl; - return (char*) seg + offset; - } - - bool OfflineLoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) - { - out << "SegmentFreeze: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl; - return true; - } - - bool OfflineLoaderContext::ImageExtensionSupported() - { - return true; - } - - hsa_status_t OfflineLoaderContext::ImageCreate( - hsa_agent_t agent, - hsa_access_permission_t image_permission, - const hsa_ext_image_descriptor_t *image_descriptor, - const void *image_data, - hsa_ext_image_t *image_handle) - { - void* ptr = alignedMalloc(256, 8); - out << "ImageCreate" << ":" << - " permission=" << image_permission << - " geometry=" << image_descriptor->geometry << - " width=" << image_descriptor->width << - " height=" << image_descriptor->height << - " depth=" << image_descriptor->depth << - " array_size=" << image_descriptor->array_size << - " channel_type=" << image_descriptor->format.channel_type << - " channel_order=" << image_descriptor->format.channel_order<< - " data=" << image_data << - std::endl; - pointers.insert(ptr); - image_handle->handle = reinterpret_cast(ptr); - return HSA_STATUS_SUCCESS; - } - - hsa_status_t OfflineLoaderContext::ImageDestroy( - hsa_agent_t agent, hsa_ext_image_t image_handle) - { - void* ptr = reinterpret_cast(image_handle.handle); - pointers.erase(ptr); - alignedFree(ptr); - return HSA_STATUS_SUCCESS; - } - - hsa_status_t OfflineLoaderContext::SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle) - { - void* ptr = alignedMalloc(256, 8); - out << "SamplerCreate" << ":" << - " coordinate_mode=" << sampler_descriptor->coordinate_mode << - " filter_mode=" << 
sampler_descriptor->filter_mode << - " address_mode=" << sampler_descriptor->address_mode << - std::endl; - pointers.insert(ptr); - sampler_handle->handle = reinterpret_cast(ptr); - return HSA_STATUS_SUCCESS; - } - - hsa_status_t OfflineLoaderContext::SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) - { - void* ptr = reinterpret_cast(sampler_handle.handle); - pointers.erase(ptr); - alignedFree(ptr); - return HSA_STATUS_SUCCESS; - } - -} // namespace loader -} // namespace hsa -} // namespace amd -} // namespace rocr diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp deleted file mode 100644 index ef6ef2a59..000000000 --- a/src/loader/loaders.hpp +++ /dev/null @@ -1,110 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. 
-// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LOADERS_HPP_ -#define LOADERS_HPP_ - -#include "core/inc/amd_hsa_loader.hpp" -#include -#include - -namespace rocr { -namespace amd { -namespace hsa { -namespace loader { - - class OfflineLoaderContext : public amd::hsa::loader::Context { - private: - hsa_isa_t invalid; - hsa_isa_t gfx700, gfx701, gfx702, gfx703, gfx704, gfx705; - hsa_isa_t gfx801, gfx802, gfx803, gfx805, gfx810; - hsa_isa_t gfx900, gfx902, gfx904, gfx906, gfx908; - hsa_isa_t gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, gfx1032, gfx1033; - std::ostream& out; - typedef std::set PointerSet; - PointerSet pointers; - - public: - OfflineLoaderContext(); - - hsa_isa_t IsaFromName(const char *name) override; - - bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; - - void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override; - - bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) override; - - void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size = 0) override; - - void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; - - void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; - - bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override; - - bool ImageExtensionSupported() override; - - hsa_status_t ImageCreate( - hsa_agent_t agent, - hsa_access_permission_t image_permission, - const hsa_ext_image_descriptor_t *image_descriptor, - const void *image_data, - hsa_ext_image_t *image_handle) override; - - hsa_status_t ImageDestroy( - hsa_agent_t agent, hsa_ext_image_t image_handle) override; - - hsa_status_t SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle) override; - - hsa_status_t SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; - }; -} // namespace loader -} // namespace hsa -} // namespace amd -} // namespace rocr - -#endif // LOADERS_HPP_
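For completeness, a minimal sketch of consuming the new 1.03 loader table and the hsa_ven_amd_loader_iterate_executables entry point added above (illustrative, not part of the patch; it assumes the runtime is initialized and the standard ROCm include layout):

#include <cassert>
#include "hsa/hsa.h"
#include "hsa/hsa_ven_amd_loader.h"

// Callback invoked once per executable; returning non-success stops traversal.
static hsa_status_t count_executable(hsa_executable_t executable, void* data) {
  ++*static_cast<int*>(data);
  return HSA_STATUS_SUCCESS;
}

// Fetch the 1.03 function table and walk the currently loaded executables.
void count_loaded_executables() {
  hsa_ven_amd_loader_1_03_pfn_t loader = {};
  hsa_status_t err = hsa_system_get_major_extension_table(
      HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader), &loader);
  assert(err == HSA_STATUS_SUCCESS);

  int count = 0;
  err = loader.hsa_ven_amd_loader_iterate_executables(count_executable, &count);
  assert(err == HSA_STATUS_SUCCESS);
}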