diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 321d6e98a..ea4f39820 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -65,6 +65,7 @@ include ( GNUInstallDirs ) if ( NOT DEFINED BUILD_SHARED_LIBS ) set ( BUILD_SHARED_LIBS ON ) endif() + set ( BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS} CACHE BOOL "Build shared library (.so) or not.") ## Adjust target name for static builds @@ -85,7 +86,7 @@ if (ROCM_CCACHE_BUILD) endif() # if (ROCM_CCACHE_BUILD) ## Get version strings -get_version ( "1.13.0" ) +get_version ( "1.14.0" ) if ( ${ROCM_PATCH_VERSION} ) set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) endif() @@ -158,7 +159,8 @@ set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY LINK_FLAGS ${HSA_SHARED_LINK ## ------------------------- End Compiler and Linker options ---------------------------- ## Source files. -set ( SRCS core/util/lnx/os_linux.cpp +set ( SRCS core/driver/driver.cpp + core/util/lnx/os_linux.cpp core/util/small_heap.cpp core/util/timer.cpp core/util/flag.cpp @@ -208,6 +210,16 @@ add_dependencies( ${CORE_RUNTIME_TARGET} amd_trap_handler_v2 ) add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/blit_shaders ) add_dependencies( ${CORE_RUNTIME_TARGET} amd_blit_shaders_v2) +option(PC_SAMPLING_SUPPORT "Enable PC Sampling Support" ON) + +if (${PC_SAMPLING_SUPPORT}) + target_compile_definitions(${CORE_RUNTIME_TARGET} PRIVATE HSA_PC_SAMPLING_SUPPORT) + + set( PCS_SRCS pcs/hsa_ven_amd_pc_sampling.cpp pcs/pcs_runtime.cpp ) + + target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${PCS_SRCS} ) +endif() + if ( NOT DEFINED IMAGE_SUPPORT AND CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64|loongarch64" ) set ( IMAGE_SUPPORT ON ) endif() @@ -228,6 +240,7 @@ if(${IMAGE_SUPPORT}) image/addrlib/src/core/addrlib.cpp image/addrlib/src/core/addrlib1.cpp image/addrlib/src/core/addrlib2.cpp + image/addrlib/src/core/addrlib3.cpp image/addrlib/src/core/addrobject.cpp image/addrlib/src/core/addrelemlib.cpp image/addrlib/src/r800/ciaddrlib.cpp @@ -236,6 +249,7 @@ 
if(${IMAGE_SUPPORT}) image/addrlib/src/gfx9/gfx9addrlib.cpp image/addrlib/src/gfx10/gfx10addrlib.cpp image/addrlib/src/gfx11/gfx11addrlib.cpp + image/addrlib/src/gfx12/gfx12addrlib.cpp image/device_info.cpp image/hsa_ext_image.cpp image/image_runtime.cpp @@ -244,6 +258,7 @@ if(${IMAGE_SUPPORT}) image/image_manager_ai.cpp image/image_manager_nv.cpp image/image_manager_gfx11.cpp + image/image_manager_gfx12.cpp image/image_lut_kv.cpp image/image_lut_gfx11.cpp image/blit_object_gfx7xx.cpp @@ -265,10 +280,12 @@ if(${IMAGE_SUPPORT}) ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx9 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx10 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx11 + ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx12 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/r800 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx9 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx10 - ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx11 ) + ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx11 + ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx12 ) target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${IMAGE_SRCS} ) @@ -278,21 +295,27 @@ if(${IMAGE_SUPPORT}) endif() -## Link dependencies. 
-target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE hsakmt::hsakmt PkgConfig::drm) target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE elf::elf dl pthread rt ) - -find_package(rocprofiler-register) -if(rocprofiler-register_FOUND) +# For static package rocprofiler-register dependency is not required +# Link to hsakmt target for shared library builds +# Link to hsakmt-staticdrm target for static library builds +if( BUILD_SHARED_LIBS ) + target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE hsakmt::hsakmt PkgConfig::drm) + find_package(rocprofiler-register) + if(rocprofiler-register_FOUND) target_compile_definitions(${CORE_RUNTIME_TARGET} PRIVATE HSA_ROCPROFILER_REGISTER=1 HSA_VERSION_MAJOR=${VERSION_MAJOR} HSA_VERSION_MINOR=${VERSION_MINOR} HSA_VERSION_PATCH=${VERSION_PATCH}) target_link_libraries(${CORE_RUNTIME_TARGET} PRIVATE rocprofiler-register::rocprofiler-register) set(HSA_DEP_ROCPROFILER_REGISTER ON) -else() + else() set(HSA_DEP_ROCPROFILER_REGISTER OFF) -endif() + endif() # end rocprofiler-register_FOUND +else() + include_directories(${drm_INCLUDE_DIRS}) + target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE hsakmt-staticdrm::hsakmt-staticdrm) +endif()#end BUILD_SHARED_LIBS ## Set the VERSION and SOVERSION values set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY VERSION "${SO_VERSION_STRING}" ) @@ -312,7 +335,7 @@ if( NOT ${BUILD_SHARED_LIBS} ) add_dependencies( ${CORE_RUNTIME_NAME} ${CORE_RUNTIME_TARGET} ) ## Add external link requirements. 
- target_link_libraries ( ${CORE_RUNTIME_NAME} INTERFACE hsakmt::hsakmt ) + target_link_libraries ( ${CORE_RUNTIME_NAME} INTERFACE hsakmt-staticdrm::hsakmt-staticdrm ) target_link_libraries ( ${CORE_RUNTIME_NAME} INTERFACE elf::elf dl pthread rt ) install ( TARGETS ${CORE_RUNTIME_NAME} EXPORT ${CORE_RUNTIME_NAME}Targets ) @@ -417,7 +440,6 @@ endif() ## Packaging directives set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Package types to build") set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.") - ## Only pack the "binary" and "dev" components, post install script will add the directory link. set ( CPACK_COMPONENTS_ALL binary dev ) # ASAN Package will have libraries and license file @@ -518,5 +540,22 @@ endif() set ( CPACK_RPM_PACKAGE_PROVIDES "hsa-ext-rocr-dev hsa-rocr-dev" ) set ( CPACK_RPM_PACKAGE_OBSOLETES "hsa-ext-rocr-dev" ) +if( NOT BUILD_SHARED_LIBS ) + # Suffix package name with static + set ( CPACK_RPM_STATIC_PACKAGE_NAME "hsa-rocr-static-devel") + set ( CPACK_DEBIAN_STATIC_PACKAGE_NAME "hsa-rocr-static-dev") + set ( CPACK_COMPONENT_STATIC_DESCRIPTION "HSA (Heterogenous System Architecture) core runtime - Linux static libraries" ) + set ( CPACK_RPM_STATIC_PACKAGE_REQUIRES "${CPACK_RPM_BINARY_PACKAGE_REQUIRES}" ) + string ( APPEND CPACK_RPM_STATIC_PACKAGE_REQUIRES " hsakmt-roct-devel" ) + set ( CPACK_DEBIAN_STATIC_PACKAGE_DEPENDS "${CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS}" ) + string ( APPEND CPACK_DEBIAN_STATIC_PACKAGE_DEPENDS ", hsakmt-roct-dev" ) +endif() ## Include packaging include ( CPack ) +# static package generation +# Group binary and dev component to single package +if( NOT BUILD_SHARED_LIBS ) + cpack_add_component_group("static") + cpack_add_component( binary GROUP static ) + cpack_add_component( dev GROUP static ) +endif() diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index 58b35f402..31196835c 100644 --- a/src/core/common/hsa_table_interface.cpp +++ 
b/src/core/common/hsa_table_interface.cpp @@ -1238,6 +1238,11 @@ hsa_status_t HSA_API hsa_amd_vmem_address_reserve(void** ptr, size_t size, uint6 return amdExtTable->hsa_amd_vmem_address_reserve_fn(ptr, size, address, flags); } +hsa_status_t HSA_API hsa_amd_vmem_address_reserve_align(void** ptr, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags) { + return amdExtTable->hsa_amd_vmem_address_reserve_align_fn(ptr, size, address, alignment, flags); +} + hsa_status_t HSA_API hsa_amd_vmem_address_free(void* ptr, size_t size) { return amdExtTable->hsa_amd_vmem_address_free_fn(ptr, size); } @@ -1298,6 +1303,11 @@ hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, si return amdExtTable->hsa_amd_agent_set_async_scratch_limit_fn(agent, threshold); } +hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* queue, + hsa_queue_info_attribute_t attribute, void* value) { + return amdExtTable->hsa_amd_queue_get_info_fn(queue, attribute, value); +} + // Tools only table interfaces. namespace rocr { diff --git a/src/core/driver/driver.cpp b/src/core/driver/driver.cpp new file mode 100644 index 000000000..02407e6b2 --- /dev/null +++ b/src/core/driver/driver.cpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/driver.h" + +#include +#include + +#include "inc/hsa.h" + +namespace rocr { +namespace core { + +Driver::Driver(const std::string devnode_name, Agent::DeviceType agent_device_type) + : agent_device_type_(agent_device_type), devnode_name_(devnode_name) { } + +hsa_status_t Driver::Open() +{ + fd_ = open(devnode_name_.c_str(), O_RDWR | O_CLOEXEC); + if (fd_ < 0) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Driver::Close() +{ + int ret(0); + if (fd_ > 0) { + ret = close(fd_); + fd_ = -1; + } + if (ret) { + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +} // namespace core +} // namespace rocr diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h index ddb8671c2..09f14f941 100644 --- a/src/core/inc/amd_aql_queue.h +++ b/src/core/inc/amd_aql_queue.h @@ -196,8 +196,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo /// @return hsa_status_t hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override; - /// @brief Submits a block of PM4 and waits until it has been executed. - void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override; + // @brief Submits a block of PM4 and waits until it has been executed. 
+ void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) override; /// @brief Enables/Disables profiling overrides SetProfiling from core::Queue void SetProfiling(bool enabled) override; @@ -208,6 +211,9 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo /// @brief Update signal value using Release semantics void StoreRelease(hsa_signal_value_t value) override; + /// @brief Provide information about the queue + hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) override; + /// @brief Enable use of GWS from this queue. hsa_status_t EnableGWS(int gws_slot_count); @@ -246,9 +252,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo void FillBufRsrcWord3(); void FillBufRsrcWord3_Gfx10(); void FillBufRsrcWord3_Gfx11(); + void FillBufRsrcWord3_Gfx12(); void FillComputeTmpRingSize(); void FillAltComputeTmpRingSize(); void FillComputeTmpRingSize_Gfx11(); + void FillComputeTmpRingSize_Gfx12(); void FreeMainScratchSpace(); void FreeAltScratchSpace(); diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index b8a5a41f5..7b47fa01a 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -46,6 +46,7 @@ #define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ #include +#include #include #include "hsakmt/hsakmt.h" @@ -59,6 +60,7 @@ #include "core/util/small_heap.h" #include "core/util/locks.h" #include "core/util/lazy_ptr.h" +#include "pcs/pcs_runtime.h" namespace rocr { namespace AMD { @@ -186,6 +188,24 @@ class GpuAgentInt : public core::Agent { // Only valid when async scratch reclaim is supported // @retval HSA_STATUS_SUCCESS if successful virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0; + + // @brief Iterate through supported PC Sampling configurations + // @retval HSA_STATUS_SUCCESS if 
successful + virtual hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, + void* cb_data) = 0; + + virtual hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, + pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) = 0; }; class GpuAgent : public GpuAgentInt { @@ -380,6 +400,9 @@ class GpuAgent : public GpuAgentInt { if (t0_.GPUClockCounter == t1_.GPUClockCounter) SyncClocks(); } + const size_t MAX_SCRATCH_APERTURE_PER_XCC = (1ULL << 32); + size_t MaxScratchDevice() const { return properties_.NumXcc * MAX_SCRATCH_APERTURE_PER_XCC; } + void ReserveScratch(); // @brief If agent supports it, release scratch memory for all AQL queues on this agent. @@ -408,6 +431,13 @@ class GpuAgent : public GpuAgentInt { const std::function& system_deallocator() const { return system_deallocator_; } + const std::function& + finegrain_allocator() const { + return finegrain_allocator_; + } + + const std::function& finegrain_deallocator() const { return finegrain_deallocator_; } + protected: // Sizes are in packets. static const uint32_t minAqlSize_ = 0x40; // 4KB min @@ -452,10 +482,25 @@ class GpuAgent : public GpuAgentInt { // @brief Binds the second-level trap handler to this node. void BindTrapHandler(); + hsa_status_t UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* stochastic_hosttrap_buffers); // @brief Override from core::Agent. 
hsa_status_t EnableDmaProfiling(bool enable) override; + hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, + void* cb_data); + hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, + pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session); + + static void PcSamplingThreadRun(void* agent); + void PcSamplingThread(); + // @brief Node properties. const HsaNodeProperties properties_; @@ -494,8 +539,9 @@ class GpuAgent : public GpuAgentInt { // @brief AQL queues for cache management and blit compute usage. enum QueueEnum { - QueueUtility, // Cache management and device to {host,device} blit compute - QueueBlitOnly, // Host to device blit + QueueUtility, // Cache management and device to {host,device} blit compute + QueueBlitOnly, // Host to device blit + QueuePCSampling, // Dedicated high priority queue for PC Sampling QueueCount }; @@ -578,8 +624,8 @@ class GpuAgent : public GpuAgentInt { // @brief Setup GWS accessing queue. void InitGWS(); - // @brief Setup NUMA aware system memory allocator. 
- void InitNumaAllocator(); + // @brief Set-up memory allocators + void InitAllocators(); // @brief Initialize scratch handler thresholds void InitAsyncScratchThresholds(); @@ -654,6 +700,58 @@ class GpuAgent : public GpuAgentInt { std::function system_deallocator_; + // Fine grain allocator on this device + std::function finegrain_allocator_; + + std::function finegrain_deallocator_; + + void* trap_handler_tma_region_; + + /* PC Sampling fields - begin */ + /* 2nd level Trap handler code is based on the offsets within this structure */ + typedef struct { + uint64_t buf_write_val; + uint32_t buf_size; + uint32_t reserved0; + uint32_t buf_written_val0; + uint32_t buf_watermark0; + hsa_signal_t done_sig0; + uint32_t buf_written_val1; + uint32_t buf_watermark1; + hsa_signal_t done_sig1; + uint8_t reserved1[16]; + /* pc_sample_t buffer0[buf_size]; */ + /* pc_sample_t buffer1[buf_size]; */ + } pcs_hosttrap_sampling_data_t; + + typedef struct { + /* Hosttrap data - stored on device so that trap_handler code can access efficiently */ + pcs_hosttrap_sampling_data_t* device_data; + + /* Hosttrap host buffer - stored on host */ + uint8_t* host_buffer; + size_t host_buffer_size; + uint8_t* host_buffer_wrap_pos; + uint8_t* host_write_ptr; + uint8_t* host_read_ptr; + size_t lost_sample_count; + std::mutex host_buffer_mutex; + + uint32_t which_buffer; + uint64_t* old_val; + uint32_t* cmd_data; + size_t cmd_data_sz; + // signal to pass into ExecutePM4() so that we do not need to re-allocate a + // new signal on each call + hsa_signal_t exec_pm4_signal; + + os::Thread thread; + pcs::PcsRuntime::PcSamplingSession* session; + } pcs_hosttrap_t; + + pcs_hosttrap_t pcs_hosttrap_data_; + /* PC Sampling fields - end */ + // @brief device handle amdgpu_device_handle ldrm_dev_; diff --git a/src/core/inc/amd_gpu_pm4.h b/src/core/inc/amd_gpu_pm4.h index 7ebf0c399..65191d5bb 100644 --- a/src/core/inc/amd_gpu_pm4.h +++ b/src/core/inc/amd_gpu_pm4.h @@ -43,11 +43,19 @@ #ifndef 
HSA_RUNTIME_CORE_INC_AMD_GPU_PM4_H_ #define HSA_RUNTIME_CORE_INC_AMD_GPU_PM4_H_ + // clang-format off + #define PM4_HDR_IT_OPCODE_NOP 0x10 #define PM4_HDR_IT_OPCODE_INDIRECT_BUFFER 0x3F #define PM4_HDR_IT_OPCODE_RELEASE_MEM 0x49 #define PM4_HDR_IT_OPCODE_ACQUIRE_MEM 0x58 +#define PM4_HDR_IT_OPCODE_ATOMIC_MEM 0x1E +#define PM4_HDR_IT_OPCODE_WRITE_DATA 0x37 +#define PM4_HDR_IT_OPCODE_WAIT_REG_MEM 0x3C +#define PM4_HDR_IT_OPCODE_COPY_DATA 0x40 +#define PM4_HDR_IT_OPCODE_DMA_DATA 0x50 + #define PM4_HDR_SHADER_TYPE(x) (((x) & 0x1) << 1) #define PM4_HDR_IT_OPCODE(x) (((x) & 0xFF) << 8) #define PM4_HDR_COUNT(x) (((x) & 0x3FFF) << 16) @@ -82,4 +90,51 @@ #define PM4_RELEASE_MEM_DW1_EVENT_INDEX(x) (((x) & 0xF) << 8) # define PM4_RELEASE_MEM_EVENT_INDEX_AQL 0x7 +#define PM4_ATOMIC_MEM_DW1_ATOMIC(x) (((x) & 0x7F) << 0) +# define PM4_ATOMIC_MEM_GL2_OP_ATOMIC_SWAP_RTN_64 (39 << 0) +#define PM4_ATOMIC_MEM_DW2_ADDR_LO(x) (((x) & 0xFFFFFFF8) << 0) +#define PM4_ATOMIC_MEM_DW3_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_ATOMIC_MEM_DW4_SRC_DATA_LO(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_ATOMIC_MEM_DW5_SRC_DATA_HI(x) (((x) & 0xFFFFFFFF) << 0) + +#define PM4_COPY_DATA_DW1(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_COPY_DATA_SRC_SEL_ATOMIC_RETURN_DATA (6 << 0) +# define PM4_COPY_DATA_DST_SEL_TC_12 (2 << 8) +# define PM4_COPY_DATA_COUNT_SEL (1 << 16) +# define PM4_COPY_DATA_WR_CONFIRM (1 << 20) +#define PM4_COPY_DATA_DW4_DST_ADDR_LO(x) (((x) & 0xFFFFFFF8) << 0) +#define PM4_COPY_DATA_DW5_DST_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) + +#define PM4_WAIT_REG_MEM_DW1(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_WAIT_REG_MEM_FUNCTION_EQUAL_TO_REFERENCE (3 << 0) +# define PM4_WAIT_REG_MEM_MEM_SPACE_MEMORY_SPACE (1 << 4) +# define PM4_WAIT_REG_MEM_OPERATION_WAIT_REG_MEM (0 << 6) +#define PM4_WAIT_REG_MEM_DW2_MEM_POLL_ADDR_LO(x) (((x) & 0xFFFFFFFC) << 0) +#define PM4_WAIT_REG_MEM_DW3_MEM_POLL_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_WAIT_REG_MEM_DW4_REFERENCE(x) (((x) & 0xFFFFFFFF) << 
0) +#define PM4_WAIT_REG_MEM_DW6(x) (((x) & 0x8000FFFF) << 0) +# define PM4_WAIT_REG_MEM_POLL_INTERVAL(x) (((x) & 0xFFFF) << 0) +# define PM4_WAIT_REG_MEM_OPTIMIZE_ACE_OFFLOAD_MODE (1 << 31) + +#define PM4_DMA_DATA_DW1(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_DMA_DATA_DST_SEL_DST_ADDR_USING_L2 (3 << 20) +# define PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2 (3 << 29) +#define PM4_DMA_DATA_DW2_SRC_ADDR_LO(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW3_SRC_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW4_DST_ADDR_LO(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW5_DST_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW6(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_DMA_DATA_BYTE_COUNT(x) (((x) & 0x3FFFFFF) << 0) +# define PM4_DMA_DATA_DIS_WC (1 << 31) +# define PM4_DMA_DATA_DIS_WC_LAST (0 << 31) + +#define PM4_WRITE_DATA_DW1(x) (((x) & 0xFFFFFF00) << 0) +# define PM4_WRITE_DATA_DST_SEL_TC_L2 (2 << 8) +# define PM4_WRITE_DATA_WR_CONFIRM_WAIT_CONFIRMATION (1 << 20) +#define PM4_WRITE_DATA_DW2_DST_MEM_ADDR_LO(x) (((x) & 0xFFFFFFFC) << 0) +#define PM4_WRITE_DATA_DW3_DST_MEM_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_WRITE_DATA_DW4_DATA(x) (((x) & 0xFFFFFFFF) << 0) + +// clang-format on + #endif // header guard diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h deleted file mode 100644 index e5ee1c4ed..000000000 --- a/src/core/inc/amd_gpu_shaders.h +++ /dev/null @@ -1,901 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. 
-// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. 
-// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_SHADERS_H_ -#define HSA_RUNTIME_CORE_INC_AMD_GPU_SHADERS_H_ - -namespace rocr { -namespace AMD { - -static const unsigned int kCodeCopyAligned7[] = { - 0xC0820100, 0xC0840104, 0xC0860108, 0xC088010C, 0xC08A0110, 0xC00C0114, - 0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900, - 0xD2506A03, 0x01A90103, 0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05, - 0x01A90105, 0xD1C2006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, 0xDC200000, - 0x01000002, 0xBF8C0F70, 0xD24A6A02, 0x00003102, 0xD2506A03, 0x01A90103, - 0xDC600000, 0x00000104, 0xD24A6A04, 0x00003104, 0xD2506A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, 0xD24A6A02, - 0x00001101, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001501, - 0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000E, 0xDC380000, - 0x08000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70, - 0xDC780000, 0x00000804, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105, - 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD24A6A02, 0x00001901, - 0xD2506A03, 0x01A90103, 0x7E0A020F, 0xD24A6A04, 0x00001D01, 0xD2506A05, - 0x01A90105, 0xD1C2006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, 0xDC300000, - 0x01000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70, - 0xDC700000, 0x00000104, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD24A6A02, 0x00002100, 0xD2506A03, - 0x01A90103, 0x7E0A0213, 0xD24A6A04, 0x00002500, 0xD2506A05, 0x01A90105, - 0xD1C2006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, 0x01000002, - 0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000, -}; - -static const unsigned int kCodeCopyMisaligned7[] = { - 0xC0820100, 0xC0840104, 0xC0860108, 0xC008010C, 0xBF8C007F, 0x8F028602, - 0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900, 0xD2506A03, 0x01A90103, - 0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05, 
0x01A90105, 0xD1C2006A, - 0x00001102, 0xBF860032, 0xDC200000, 0x06000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xDC200000, 0x07000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xDC200000, 0x08000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xDC200000, 0x09000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xBF8C0F70, 0xDC600000, 0x00000604, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000704, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000804, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000904, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD24A6A02, - 0x00001100, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001500, - 0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, - 0xDC200000, 0x01000002, 0xD24A6A02, 0x00002102, 0xD2506A03, 0x01A90103, - 0xBF8C0F70, 0xDC600000, 0x00000104, 0xD24A6A04, 0x00002104, 0xD2506A05, - 0x01A90105, 0xBF82FFEE, 0xBF810000, -}; - -static const unsigned int kCodeFill7[] = { - 0xC0820100, 0xC0840104, 0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E08020A, - 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8F0C840B, 0x34020084, 0x7E060205, - 0xD24A6A02, 0x00000901, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00000D02, - 0xBF860007, 0xDC780000, 0x00000402, 0xD24A6A02, 0x00001902, 0xD2506A03, - 0x01A90103, 0xBF82FFF6, 0x8F0C820B, 0x34020082, 0x7E060207, 0xD24A6A02, - 0x00000D01, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00001102, 0xBF860008, - 0x87FE6A7E, 0xDC700000, 0x00000402, 0xD24A6A02, 0x00001902, 0xD2506A03, - 0x01A90103, 0xBF82FFF5, 0xBF810000, -}; - -static const unsigned int kCodeTrapHandler8[] = { - 0xC0061C80, 0x000000C0, 0xBF8C007F, 0xBEFE0181, 0x80728872, 0x82738073, - 0x7E000272, 0x7E020273, 0x7E0402FF, 0x80000000, 0x7E060280, 0xDD800000, - 0x00000200, 0xBF8C0F70, 0x7DD40500, 0xBF870011, 0xC0061D39, 0x00000008, - 0xBF8C007F, 0x86F47474, 0xBF84000C, 0x80729072, 0x82738073, 0xC0021CB9, - 0x00000000, 
0xBF8C007F, 0x7E000274, 0x7E020275, 0x7E040272, 0xDC700000, - 0x00000200, 0xBF8C0F70, 0xBF900001, 0xBF8D0001, 0xBE801F70, -}; - -static const unsigned int kCodeTrapHandler9[] = { -/* - .set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF - .set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 - .set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 - .set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) - .set SQ_WAVE_PC_HI_HT_MASK , 0x1000000 - .set SQ_WAVE_STATUS_HALT_BIT , 13 - .set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_BIT | (1 << 16)) - .set SQ_WAVE_TRAPSTS_ADDRESS_WATCH_MASK , 0x7080 - .set SQ_WAVE_TRAPSTS_MEM_VIOL_MASK , 0x100 - .set SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK , 0x800 - .set SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK , 0x10000000 - .set SQ_WAVE_MODE_DEBUG_EN_SHIFT , 11 - .set SIGNAL_CODE_MEM_VIOL , (1 << 29) - .set SIGNAL_CODE_ILLEGAL_INST , (1 << 30) - .set SIGNAL_CODE_LLVM_TRAP , (1 << 31) - .set MAX_NUM_DOORBELLS_MASK , ((1 << 10) - 1) - .set SENDMSG_M0_DOORBELL_ID_BITS , 12 - .set SENDMSG_M0_DOORBELL_ID_MASK , ((1 << SENDMSG_M0_DOORBELL_ID_BITS) - 1) - - .set TTMP7_DISPATCH_ID_CONVERTED_BIT , 31 - .set TTMP7_WAVE_STOPPED_BIT , 30 - .set TTMP7_SAVED_STATUS_HALT_BIT , 29 - .set TTMP7_SAVED_TRAP_ID_SHIFT , 25 - .set TTMP7_SAVED_TRAP_ID_BITS , 4 - .set TTMP7_SAVED_TRAP_ID_MASK , ((1 << TTMP7_SAVED_TRAP_ID_BITS) - 1) - .set TTMP7_PACKET_INDEX_BITS , 25 - .set TTMP7_PACKET_INDEX_MASK , ((1 << TTMP7_PACKET_INDEX_BITS) - 1) - .set TTMP11_PC_HI_SHIFT , 7 - - .if .amdgcn.gfx_generation_number == 9 - .set DEBUG_INTERRUPT_CONTEXT_ID_BIT , 23 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 - .elseif .amdgcn.gfx_generation_number == 10 - .set DEBUG_INTERRUPT_CONTEXT_ID_BIT , 22 - .set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 - .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set 
SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000 - .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000 - .else - .error "unsupported target" - .endif - - // ABI between first and second level trap handler: - // ttmp0 = PC[31:0] - // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] - // ttmp12 = SQ_WAVE_STATUS - // ttmp14 = TMA[31:0] - // ttmp15 = TMA[63:32] - // gfx9: - // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[18:0], NoScratch[0], WaveIdInWG[5:0] - // gfx10: - // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], 0[16:0], NoScratch[0], WaveIdInWG[5:0] - - .macro mGetDoorbellId - s_mov_b32 exec_lo, 0x80000000 - s_sendmsg sendmsg(MSG_GET_DOORBELL) - .wait_sendmsg_\@: - s_nop 7 - s_bitcmp0_b32 exec_lo, 0x1F - s_cbranch_scc0 .wait_sendmsg_\@ - .endm - - .macro mExitTrap - // Restore SQ_WAVE_IB_STS. - .if .amdgcn.gfx_generation_number == 9 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - .endif - .if .amdgcn.gfx_generation_number == 10 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK - s_or_b32 ttmp2, ttmp2, ttmp3 - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - .endif - - // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 - - // Return to shader at unmodified PC. 
- s_rfe_b64 [ttmp0, ttmp1] - .endm - - trap_entry: - s_andn2_b32 ttmp7, ttmp7, (TTMP7_SAVED_TRAP_ID_MASK << TTMP7_SAVED_TRAP_ID_SHIFT) | (1 << TTMP7_SAVED_STATUS_HALT_BIT) - - // Save the entry status.halt in ttmp7.saved_status_halt - s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE - s_lshl_b32 ttmp2, ttmp2, TTMP7_SAVED_STATUS_HALT_BIT - s_or_b32 ttmp7, ttmp7, ttmp2 - - // If trap raised (non-zero trap id) then branch. - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc1 .trap_raised - - // If non-masked exception raised then branch. - s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) - s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK) - s_cbranch_scc1 .excp_raised - - .signal_debugger: - // Fetch doorbell index for our queue. - s_mov_b32 ttmp2, exec_lo - s_mov_b32 ttmp3, exec_hi - mGetDoorbellId - s_mov_b32 exec_hi, ttmp3 - - // Restore exec_lo, move the doorbell_id into ttmp3 - s_and_b32 ttmp3, exec_lo, SENDMSG_M0_DOORBELL_ID_MASK - s_mov_b32 exec_lo, ttmp2 - - // Set the debug interrupt context id. - // FIXME: Make conditional when exceptions are handled. - s_bitset1_b32 ttmp3, DEBUG_INTERRUPT_CONTEXT_ID_BIT - - // Send an interrupt to trigger event notification. - s_mov_b32 ttmp2, m0 - s_mov_b32 m0, ttmp3 - s_nop 0x0 // Manually inserted wait states - s_sendmsg sendmsg(MSG_INTERRUPT) - - // Restore m0 - s_mov_b32 m0, ttmp2 - - // Parking the wave requires saving the original pc in the preserved ttmps. - // Since all ttmps are used, we must first free ttmp6 by compressing the - // 40bit dispatch ptr in ttmp6:7 into a 25bit queue packet id. 
- // - // Register layout before parking the wave: - // - // ttmp6: dispatch_ptr[31:6] 0[5:0] - // ttmp7: 0[0] wave_stopped[0] status_halt[0] trap_id[3:0] 0[16:0] dispatch_ptr[39:32] - // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] - // - // After parking the wave: - // - // ttmp6: pc_lo[31:0] - // ttmp7: 1[0] wave_stopped[0] status_halt[0] trap_id[3:0] packet_id[24:0] - // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] - // - // The conversion from dispatch ptr to queue packet index only needs to be - // done once, the first time the wave executes the trap handler. - - .if ((.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) || .amdgcn.gfx_generation_number > 10) - s_branch .halt_wave - .else - s_bitcmp1_b32 ttmp7, TTMP7_DISPATCH_ID_CONVERTED_BIT - s_cbranch_scc1 .ttmp7_has_dispatch_index - - s_and_b32 ttmp3, ttmp3, MAX_NUM_DOORBELLS_MASK - s_lshl_b32 ttmp3, ttmp3, 0x3 - - // Map doorbell index to amd_queue_t* through TMA (doorbell_queue_map). - s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], ttmp3 glc - s_waitcnt lgkmcnt(0) - - // Retrieve queue base_address from hsa_queue_t*. - s_load_dword ttmp2, [ttmp2, ttmp3], 0x8 glc - s_waitcnt lgkmcnt(0) - - // The dispatch index is (dispatch_ptr.lo - base_address.lo) >> 6 - s_sub_u32 ttmp2, ttmp6, ttmp2 - s_lshr_b32 ttmp2, ttmp2, 0x6 - s_andn2_b32 ttmp7, ttmp7, TTMP7_PACKET_INDEX_MASK - s_or_b32 ttmp7, ttmp7, ttmp2 - s_bitset1_b32 ttmp7, TTMP7_DISPATCH_ID_CONVERTED_BIT - - .ttmp7_has_dispatch_index: - // Save the PC - s_mov_b32 ttmp6, ttmp0 - s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP11_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP11_PC_HI_SHIFT) - s_or_b32 ttmp11, ttmp11, ttmp1 - - // Park the wave - s_getpc_b64 [ttmp0, ttmp1] - s_add_u32 ttmp0, ttmp0, .parked - . 
- s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .halt_wave - - .parked: - s_trap 0x2 - s_branch .parked - .endif - - .excp_raised: - // If memory violation without XNACK error then signal queue error. - // XNACK error will be handled by VM interrupt, since it has more information. - s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK) - s_cmp_eq_u32 ttmp3, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_mov_b32 ttmp3, SIGNAL_CODE_MEM_VIOL - s_cbranch_scc1 .signal_error - - // If illegal instruction then signal queue error. - s_and_b32 ttmp3, ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK - s_mov_b32 ttmp3, SIGNAL_CODE_ILLEGAL_INST - s_cbranch_scc1 .signal_error - - // Otherwise (memory violation with XNACK error) return to shader. Do not - // send a signal as that will cause an interrupt storm. Instead let the - // interrupt generated by the TLB miss cause the kernel to notify ROCr and - // put the queue into an error state. This also ensures the TLB interrupt - // is received which provides information about the page causing the fault. - s_branch .halt_wave - - .trap_raised: - // Save the entry trap id in ttmp7.saved_trap_id - s_min_u32 ttmp3, ttmp2, 0xF - s_lshl_b32 ttmp3, ttmp3, TTMP7_SAVED_TRAP_ID_SHIFT - s_or_b32 ttmp7, ttmp7, ttmp3 - - // If debugger trap (s_trap >= 3) then signal debugger. - s_cmp_ge_u32 ttmp2, 0x3; - s_cbranch_scc1 .signal_debugger - - // If llvm.trap (s_trap 2) then signal queue error. - s_cmp_eq_u32 ttmp2, 0x2 - s_mov_b32 ttmp3, SIGNAL_CODE_LLVM_TRAP - s_cbranch_scc1 .signal_error - - // For other traps advance PC and return to shader. - s_add_u32 ttmp0, ttmp0, 0x4 - s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .exit_trap - - .signal_error: - .if (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) - // This needs to be rewritten for gfx10.3 as scalar stores are not available. - .else - // FIXME: don't trash ttmp4/ttmp5 when exception handling is unified. 
- s_mov_b32 ttmp4, ttmp3 - - // Fetch doorbell index for our queue. - s_mov_b32 ttmp2, exec_lo - s_mov_b32 ttmp3, exec_hi - mGetDoorbellId - s_mov_b32 exec_hi, ttmp3 - - // Restore exec_lo, move the doorbell index into ttmp3 - s_and_b32 exec_lo, exec_lo, MAX_NUM_DOORBELLS_MASK - s_lshl_b32 ttmp3, exec_lo, 0x3 - s_mov_b32 exec_lo, ttmp2 - - // Map doorbell index to amd_queue_t* through TMA (doorbell_queue_map). - s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], ttmp3 glc - s_waitcnt lgkmcnt(0) - - // Retrieve queue_inactive_signal from amd_queue_t*. - s_load_dwordx2 [ttmp2, ttmp3], [ttmp2, ttmp3], 0xC0 glc - s_waitcnt lgkmcnt(0) - - // Set queue signal value to error code. - s_mov_b32 ttmp5, 0x0 - s_atomic_swap_x2 [ttmp4, ttmp5], [ttmp2, ttmp3], 0x8 glc - s_waitcnt lgkmcnt(0) - - // Skip event trigger if the signal value was already non-zero. - s_or_b32 ttmp4, ttmp4, ttmp5 - s_cbranch_scc1 .skip_event_trigger - - // Check for a non-NULL signal event mailbox. - s_load_dwordx2 [ttmp4, ttmp5], [ttmp2, ttmp3], 0x10 glc - s_waitcnt lgkmcnt(0) - s_and_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], [ttmp4, ttmp5] - s_cbranch_scc0 .skip_event_trigger - - // Load the signal event value. - s_load_dword ttmp2, [ttmp2, ttmp3], 0x18 glc - s_waitcnt lgkmcnt(0) - - // Write the signal event value to the mailbox. - s_store_dword ttmp2, [ttmp4, ttmp5], 0x0 glc - s_waitcnt lgkmcnt(0) - - // Send an interrupt to trigger event notification. - s_mov_b32 m0, 0x0 - s_nop 0 - s_sendmsg sendmsg(MSG_INTERRUPT) - .endif - - .skip_event_trigger: - // Since we trashed ttmp4/ttmp5, reset the wave_id to 0 - s_mov_b32 ttmp4, 0x0 - s_mov_b32 ttmp5, 0x0 - - .halt_wave: - s_bitset1_b32 ttmp7, TTMP7_WAVE_STOPPED_BIT - - // Halt the wavefront. 
- s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_BIT - - .exit_trap: - mExitTrap -*/ - 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, - 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, - 0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, - 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, - 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, - 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, - 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, - 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, - 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, - 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, - 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, - 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, - 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, - 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, - 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, - 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, - 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, - 0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, -}; - -static const unsigned int kCodeTrapHandler90a[] = { - 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, - 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, - 
0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, - 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, - 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, - 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, - 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, - 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, - 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, - 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, - 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, - 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, - 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, - 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, - 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, - 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, - 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, - 0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, -}; - -static const unsigned int kCodeCopyAligned8[] = { - 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, - 0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050, - 0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205, 0xD1196A02, 0x00000900, - 0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04, 0x00000D00, 0xD11C6A05, - 0x01A90105, 0xD0E9006A, 0x00001102, 0xBF86000F, 0x86FE6A7E, 0xDC400000, - 0x01000002, 
0xBF8C0F70, 0xD1196A02, 0x00003102, 0xD11C6A03, 0x01A90103, - 0xDC600000, 0x00000104, 0xD1196A04, 0x00003104, 0xD11C6A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE01C1, 0x8E198418, 0x24020084, 0x7E060209, 0xD1196A02, - 0x00001101, 0xD11C6A03, 0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001501, - 0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001902, 0xBF86000E, 0xDC5C0000, - 0x08000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, - 0xDC7C0000, 0x00000804, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105, - 0xBF82FFEF, 0x8E198218, 0x24020082, 0x7E06020D, 0xD1196A02, 0x00001901, - 0xD11C6A03, 0x01A90103, 0x7E0A020F, 0xD1196A04, 0x00001D01, 0xD11C6A05, - 0x01A90105, 0xD0E9006A, 0x00002102, 0xBF86000F, 0x86FE6A7E, 0xDC500000, - 0x01000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, - 0xDC700000, 0x00000104, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE01C1, 0x7E060211, 0xD1196A02, 0x00002100, 0xD11C6A03, - 0x01A90103, 0x7E0A0213, 0xD1196A04, 0x00002500, 0xD11C6A05, 0x01A90105, - 0xD0E9006A, 0x00002902, 0xBF860006, 0x86FE6A7E, 0xDC400000, 0x01000002, - 0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000, -}; - -static const unsigned int kCodeCopyMisaligned8[] = { - 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, - 0xC0020400, 0x00000030, 0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205, - 0xD1196A02, 0x00000900, 0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04, - 0x00000D00, 0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001102, 0xBF860032, - 0xDC400000, 0x06000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xDC400000, 0x07000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xDC400000, 0x08000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xDC400000, 0x09000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xBF8C0F70, 0xDC600000, 0x00000604, 0xD1196A04, 0x00002104, 0xD11C6A05, - 0x01A90105, 0xDC600000, 0x00000704, 0xD1196A04, 0x00002104, 0xD11C6A05, - 0x01A90105, 0xDC600000, 0x00000804, 0xD1196A04, 
0x00002104, 0xD11C6A05, - 0x01A90105, 0xDC600000, 0x00000904, 0xD1196A04, 0x00002104, 0xD11C6A05, - 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD1196A02, 0x00001100, 0xD11C6A03, - 0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001500, 0xD11C6A05, 0x01A90105, - 0xD0E9006A, 0x00001902, 0xBF86000F, 0x86FE6A7E, 0xDC400000, 0x01000002, - 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, 0xDC600000, - 0x00000104, 0xD1196A04, 0x00002104, 0xD11C6A05, 0x01A90105, 0xBF82FFEE, - 0xBF810000, -}; - -static const unsigned int kCodeFill8[] = { - 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xBF8C007F, 0x8E028602, - 0x32000002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8E0C840B, - 0x24020084, 0x7E060205, 0xD1196A02, 0x00000901, 0xD11C6A03, 0x01A90103, - 0xD0E9006A, 0x00000D02, 0xBF860007, 0xDC7C0000, 0x00000402, 0xD1196A02, - 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF6, 0x8E0C820B, 0x24020082, - 0x7E060207, 0xD1196A02, 0x00000D01, 0xD11C6A03, 0x01A90103, 0xD0E9006A, - 0x00001102, 0xBF860008, 0x86FE6A7E, 0xDC700000, 0x00000402, 0xD1196A02, - 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, -}; - -static const unsigned int kCodeCopyAligned10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, - 0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050, - 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02, - 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006, - 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, - 0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03, - 0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05, - 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, - 0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, - 0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E, - 0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, - 0xBF8C3F70, 
0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05, - 0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02, - 0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E, - 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, - 0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, - 0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05, - 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010, - 0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05, - 0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, - 0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000, -}; - -static const unsigned int kCodeCopyMisaligned10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, - 0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, - 0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, - 0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, - 0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008, - 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05, - 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000, - 0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70, - 0xDC600000, 0x007D0104, 0xD70F6A04, 
0x00020810, 0xD5286A05, 0x01A90105, - 0xBF82FFEE, 0xBF810000, -}; - -static const unsigned int kCodeFill10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602, - 0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, - 0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03, - 0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402, - 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B, - 0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103, - 0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402, - 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, -}; - -static const unsigned int kCodeTrapHandler1010[] = { - 0x8a73ff73, 0x3e000000, 0x93eeff78, 0x0001000d, 0x8f6e9d6e, 0x88736e73, - 0x93eeff6d, 0x00080010, 0xbf850041, 0xb96ef803, 0x876fff6e, 0x00000900, - 0xbf850031, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x876fff7e, 0x00000fff, - 0xbefe036e, 0xbeef1d96, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, - 0xbefc036e, 0xbf0d9f73, 0xbf85000f, 0x876fff6f, 0x000003ff, 0x8f6f836f, - 0xf4051bbd, 0xde000000, 0xbf8cc07f, 0xf4011bb7, 0xfa000008, 0xbf8cc07f, - 0x80ee6e72, 0x906e866e, 0x8a73ff73, 0x01ffffff, 0x88736e73, 0xbef31d9f, - 0xbef2036c, 0x876dff6d, 0x0000ffff, 0x8f6d876d, 0x8a77ff77, 0x007fff80, - 0x88776d77, 0xbeec1f00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, - 0xbf920002, 0xbf82fffe, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef03ff, 0x20000000, 0xbf850011, 0x876fff6e, 0x00000800, 0xbeef03f4, - 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8f6f996f, 0x88736f73, 0xbf09836e, - 0xbf85ffbe, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf82002c, 0xbef0036f, 0xbeee037e, 0xbeef037f, 0xbefe03ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, - 0x877eff7e, 0x000003ff, 0x8f6f837e, 0xbefe036e, 
0xf4051bbd, 0xde000000, - 0xbf8cc07f, 0xf4051bb7, 0xfa0000c0, 0xbf8cc07f, 0xbef10380, 0xf6811c37, - 0xfa000008, 0xbf8cc07f, 0x88707170, 0xbf85000e, 0xf4051c37, 0xfa000010, - 0xbf8cc07f, 0x87f07070, 0xbf840009, 0xf4011bb7, 0xfa000018, 0xbf8cc07f, - 0xf4411bb8, 0xfa000000, 0xbf8cc07f, 0xbefc0380, 0xbf800000, 0xbf900001, - 0xbef00380, 0xbef10380, 0xbef31d9e, 0xbef81d8d, 0x906e8977, 0x876fff6e, - 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 0x886e6f6e, 0xb9eef807, - 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, -}; - -static const unsigned int kCodeTrapHandler10[] = { - 0x8a73ff73, 0x3e000000, 0x93eeff78, 0x0001000d, 0x8f6e9d6e, 0x88736e73, - 0x93eeff6d, 0x00080010, 0xbf850023, 0xb96ef803, 0x876fff6e, 0x00000900, - 0xbf850013, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x876fff7e, 0x00000fff, - 0xbefe036e, 0xbeef1d96, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, - 0xbefc036e, 0xbf82001a, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef03ff, 0x20000000, 0xbf850011, 0x876fff6e, 0x00000800, 0xbeef03f4, - 0xbf85000d, 0xbf82000e, 0x83ef8f6e, 0x8f6f996f, 0x88736f73, 0xbf09836e, - 0xbf85ffdc, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf820004, 0xbef00380, 0xbef10380, 0xbef31d9e, 0xbef81d8d, - 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, - 0x886e6f6e, 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, -}; - -/* -.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF -.set SQ_WAVE_PC_HI_HT_SHIFT , 24 -.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 -.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 -.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) -.set SQ_WAVE_STATUS_HALT_SHIFT , 13 -.set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16)) -.set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT , 8 -.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11 -.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28 -.set SQ_WAVE_TRAPSTS_MATH_EXCP 
, 0x7F -.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12 -.set TRAP_ID_ABORT , 2 -.set TRAP_ID_DEBUGTRAP , 3 -.set DOORBELL_ID_SIZE , 10 -.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) -.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) -.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) -.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) -.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) -.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) -.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) - -.set TTMP6_WAVE_STOPPED_SHIFT , 30 -.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 -.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) -.set TTMP6_SAVED_TRAP_ID_SHIFT , 25 -.set TTMP6_SAVED_TRAP_ID_SIZE , 4 -.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) -.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) -.set TTMP11_PC_HI_SHIFT , 7 -.set TTMP11_DEBUG_ENABLED_SHIFT , 23 - -.if .amdgcn.gfx_generation_number == 9 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 -.elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - .set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 - .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000 - .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000 -.endif - -// ABI between first and second level trap handler: -// ttmp0 = PC[31:0] -// ttmp12 = SQ_WAVE_STATUS -// ttmp14 = TMA[31:0] -// ttmp15 = TMA[63:32] -// gfx9: -// ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] -// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] -// gfx10: -// ttmp1 
= 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] -// gfx1010: -// ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] -// gfx1030: -// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] - -trap_entry: - // Branch if not a trap (an exception instead). - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc0 .no_skip_debugtrap - - // If caused by s_trap then advance PC. - s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT - s_cbranch_scc1 .not_s_trap - s_add_u32 ttmp0, ttmp0, 0x4 - s_addc_u32 ttmp1, ttmp1, 0x0 - -.not_s_trap: - // If llvm.debugtrap and debugger is not attached. - s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP - s_cbranch_scc0 .no_skip_debugtrap - s_bitcmp0_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT - s_cbranch_scc0 .no_skip_debugtrap - - // Ignore llvm.debugtrap. - s_branch .exit_trap - -.no_skip_debugtrap: - // Save trap id and halt status in ttmp6. - s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) - s_min_u32 ttmp2, ttmp2, 0xF - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 - s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 - - // Fetch doorbell id for our queue. - s_mov_b32 ttmp2, exec_lo - s_mov_b32 ttmp3, exec_hi - s_mov_b32 exec_lo, 0x80000000 - s_sendmsg sendmsg(MSG_GET_DOORBELL) -.wait_sendmsg: - s_nop 0x7 - s_bitcmp0_b32 exec_lo, 0x1F - s_cbranch_scc0 .wait_sendmsg - s_mov_b32 exec_hi, ttmp3 - - // Restore exec_lo, move the doorbell_id into ttmp3 - s_and_b32 ttmp3, exec_lo, DOORBELL_ID_MASK - s_mov_b32 exec_lo, ttmp2 - - // Map trap reason to an exception code. 
- s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) - - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT - s_cbranch_scc0 .not_memory_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 - - // Aperture violation requires XNACK_ERROR == 0. - s_branch .not_aperture_violation - -.not_memory_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT - s_cbranch_scc0 .not_aperture_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 - -.not_aperture_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT - s_cbranch_scc0 .not_illegal_instruction - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 - -.not_illegal_instruction: - s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP - s_cbranch_scc0 .not_math_exception - s_getreg_b32 ttmp7, hwreg(HW_REG_MODE) - s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT - s_and_b32 ttmp2, ttmp2, ttmp7 - s_cbranch_scc0 .not_math_exception - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 - -.not_math_exception: - s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE - s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT - s_cbranch_scc0 .not_abort_trap - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 - -.not_abort_trap: - // If no other exception was flagged then report a generic error. - s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK - s_cbranch_scc1 .send_interrupt - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 - -.send_interrupt: - // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id - s_mov_b32 ttmp2, m0 - s_mov_b32 m0, ttmp3 - s_nop 0x0 // Manually inserted wait states - s_sendmsg sendmsg(MSG_INTERRUPT) - s_mov_b32 m0, ttmp2 - - // Parking the wave requires saving the original pc in the preserved ttmps. 
- // Register layout before parking the wave: - // - // ttmp7: 0[31:0] - // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] - // - // After parking the wave: - // - // ttmp7: pc_lo[31:0] - // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] - -.if ((.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) || .amdgcn.gfx_generation_number > 10) - s_branch .halt_wave -.else - // Save the PC - s_mov_b32 ttmp7, ttmp0 - s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP11_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP11_PC_HI_SHIFT) - s_or_b32 ttmp11, ttmp11, ttmp1 - - // Park the wave - s_getpc_b64 [ttmp0, ttmp1] - s_add_u32 ttmp0, ttmp0, .parked - . - s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .halt_wave - -.parked: - s_trap 0x2 - s_branch .parked -.endif - -.halt_wave: - // Halt the wavefront upon restoring STATUS below. - s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT - s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT - -.exit_trap: - // Restore SQ_WAVE_IB_STS. -.if .amdgcn.gfx_generation_number == 9 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 -.endif -.if .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK - s_or_b32 ttmp2, ttmp2, ttmp3 - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 -.endif - - // Restore SQ_WAVE_STATUS. 
- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 - - // Return to original (possibly modified) PC. - s_rfe_b64 [ttmp0, ttmp1] -*/ - -static const unsigned int kCodeTrapHandlerV2_9[] = { - 0x92eeff6d, 0x00080010, 0xbf840009, 0xbf0d986d, 0xbf850002, 0x806c846c, - 0x826d806d, 0xbf06836e, 0xbf840003, 0xbf0c9777, 0xbf840001, 0xbf82004c, - 0x8972ff72, 0x3e000000, 0x83ee8f6e, 0x8e6e996e, 0x87726e72, 0x92eeff78, - 0x0001000d, 0x8e6e9d6e, 0x87726e72, 0xbeee007e, 0xbeef007f, 0xbefe00ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, - 0x866fff7e, 0x000003ff, 0xbefe006e, 0xb8eef803, 0xbf0d9c6e, 0xbf840003, - 0x876fff6f, 0x00004000, 0xbf820004, 0xbf0d886e, 0xbf840002, 0x876fff6f, - 0x00008000, 0xbf0d8b6e, 0xbf840002, 0x876fff6f, 0x00002000, 0x866eff6e, - 0x0000007f, 0xbf840006, 0xb8f3f801, 0x8e6e8c6e, 0x866e736e, 0xbf840002, - 0x876fff6f, 0x00001000, 0x92eeff72, 0x00040019, 0xbf06826e, 0xbf840002, - 0x876fff6f, 0x00000400, 0x896eff6f, 0x000003ff, 0xbf850002, 0x876fff6f, - 0x00000800, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, 0xbefc006e, - 0xbef3006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, - 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820002, - 0xbf920002, 0xbf82fffe, 0xbef21a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, -}; - -static const unsigned int kCodeTrapHandlerV2_1010[] = { - 0x93eeff6d, 0x00080010, 0xbf840009, 0xbf0d986d, 0xbf850002, 0x806c846c, - 0x826d806d, 0xbf06836e, 0xbf840003, 0xbf0c9777, 0xbf840001, 0xbf82004c, - 0x8a72ff72, 0x3e000000, 0x83ee8f6e, 0x8f6e996e, 0x88726e72, 0x93eeff78, - 0x0001000d, 0x8f6e9d6e, 0x88726e72, 0xbeee037e, 0xbeef037f, 0xbefe03ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, - 0x876fff7e, 0x000003ff, 0xbefe036e, 
0xb96ef803, 0xbf0d9c6e, 0xbf840003, - 0x886fff6f, 0x00004000, 0xbf820004, 0xbf0d886e, 0xbf840002, 0x886fff6f, - 0x00008000, 0xbf0d8b6e, 0xbf840002, 0x886fff6f, 0x00002000, 0x876eff6e, - 0x0000007f, 0xbf840006, 0xb973f801, 0x8f6e8c6e, 0x876e736e, 0xbf840002, - 0x886fff6f, 0x00001000, 0x93eeff72, 0x00040019, 0xbf06826e, 0xbf840002, - 0x886fff6f, 0x00000400, 0x8a6eff6f, 0x000003ff, 0xbf850002, 0x886fff6f, - 0x00000800, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, 0xbefc036e, - 0xbef3036c, 0x876dff6d, 0x0000ffff, 0x8f6d876d, 0x8a77ff77, 0x007fff80, - 0x88776d77, 0xbeec1f00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820002, - 0xbf920002, 0xbf82fffe, 0xbef21d9e, 0xbef81d8d, 0x906e8977, 0x876fff6e, - 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 0x886e6f6e, 0xb9eef807, - 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, -}; - -static const unsigned int kCodeTrapHandlerV2_10[] = { - 0x93eeff6d, 0x00080010, 0xbf840009, 0xbf0d986d, 0xbf850002, 0x806c846c, - 0x826d806d, 0xbf06836e, 0xbf840003, 0xbf0c9777, 0xbf840001, 0xbf82003f, - 0x8a72ff72, 0x3e000000, 0x83ee8f6e, 0x8f6e996e, 0x88726e72, 0x93eeff78, - 0x0001000d, 0x8f6e9d6e, 0x88726e72, 0xbeee037e, 0xbeef037f, 0xbefe03ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, - 0x876fff7e, 0x000003ff, 0xbefe036e, 0xb96ef803, 0xbf0d9c6e, 0xbf840003, - 0x886fff6f, 0x00004000, 0xbf820004, 0xbf0d886e, 0xbf840002, 0x886fff6f, - 0x00008000, 0xbf0d8b6e, 0xbf840002, 0x886fff6f, 0x00002000, 0x876eff6e, - 0x0000007f, 0xbf840006, 0xb973f801, 0x8f6e8c6e, 0x876e736e, 0xbf840002, - 0x886fff6f, 0x00001000, 0x93eeff72, 0x00040019, 0xbf06826e, 0xbf840002, - 0x886fff6f, 0x00000400, 0x8a6eff6f, 0x000003ff, 0xbf850002, 0x886fff6f, - 0x00000800, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, 0xbefc036e, - 0xbf820000, 0xbef21d9e, 0xbef81d8d, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, - 0xbe80226c, -}; - - -} // namespace amd -} // namespace rocr - -#endif // header guard diff --git a/src/core/inc/amd_hsa_code.hpp 
b/src/core/inc/amd_hsa_code.hpp index 724100826..08a898cc6 100644 --- a/src/core/inc/amd_hsa_code.hpp +++ b/src/core/inc/amd_hsa_code.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -270,7 +270,7 @@ namespace code { void AddNoteProducerOptions(int32_t call_convention, const hsa_ext_control_directives_t& user_directives, const std::string& user_options); bool GetNoteProducerOptions(std::string& options); - bool GetIsa(std::string& isaName); + bool GetIsa(std::string& isaName, unsigned *genericVersion = nullptr); bool GetCodeObjectVersion(uint32_t* major, uint32_t* minor); hsa_status_t GetInfo(hsa_code_object_info_t attribute, void *value); hsa_status_t GetSymbol(const char *module_name, const char *symbol_name, hsa_code_symbol_t *sym); diff --git a/src/core/inc/amd_hsa_loader.hpp b/src/core/inc/amd_hsa_loader.hpp index f30047d0b..c63b7a961 100644 --- a/src/core/inc/amd_hsa_loader.hpp +++ b/src/core/inc/amd_hsa_loader.hpp @@ -50,6 +50,7 @@ #include "inc/hsa_ven_amd_loader.h" #include "inc/amd_hsa_elf.h" #include +#include #include #include @@ -162,8 +163,12 @@ class Context { virtual hsa_isa_t IsaFromName(const char *name) = 0; + // This function will be deleted in a future patch. Use the overload + // that takes a generic version instead. virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) = 0; + virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa, unsigned genericVersion) { return IsaSupportedByAgent(agent, isa); } + virtual void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) = 0; virtual bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) = 0; @@ -453,6 +458,13 @@ class Loader { const char *options, hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) = 0; + /// @brief Creates empty AMD HSA Executable with specified @p profile, + /// @p options and @p isolated_context that is isolated from the runtime. 
+ virtual Executable* CreateExecutable( + std::unique_ptr isolated_context, + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) = 0; /// @brief Freezes @p executable virtual hsa_status_t FreezeExecutable(Executable *executable, const char *options) = 0; diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h index da33a655c..adc2d1645 100644 --- a/src/core/inc/amd_memory_region.h +++ b/src/core/inc/amd_memory_region.h @@ -96,7 +96,7 @@ class MemoryRegion : public core::MemoryRegion { static void MakeKfdMemoryUnresident(const void* ptr); MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain, - core::Agent* owner, const HsaMemoryProperties& mem_props); + bool user_visible, core::Agent* owner, const HsaMemoryProperties& mem_props); ~MemoryRegion(); @@ -193,7 +193,7 @@ class MemoryRegion : public core::MemoryRegion { // fragments of the block routing to the same MemoryRegion. 
mutable KernelMutex access_lock_; - static const size_t kPageSize_ = 4096; + static size_t kPageSize_; // Determine access type allowed to requesting device hsa_amd_memory_pool_access_t GetAccessInfo(const core::Agent& agent, diff --git a/src/core/inc/checked.h b/src/core/inc/checked.h index 93793bcc7..56497d120 100644 --- a/src/core/inc/checked.h +++ b/src/core/inc/checked.h @@ -58,7 +58,7 @@ template class Check final { Check(const Check&) { object_ = uintptr_t(this) ^ uintptr_t(code); } Check(Check&&) { object_ = uintptr_t(this) ^ uintptr_t(code); } - ~Check() { object_ = NULL; } + ~Check() { object_ = uintptr_t(NULL); } const Check& operator=(Check&& rhs) { return *this; } const Check& operator=(const Check& rhs) { return *this; } diff --git a/src/core/inc/driver.h b/src/core/inc/driver.h new file mode 100644 index 000000000..c6b7ffac1 --- /dev/null +++ b/src/core/inc/driver.h @@ -0,0 +1,109 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTME_CORE_INC_DRIVER_H_ +#define HSA_RUNTME_CORE_INC_DRIVER_H_ + +#include + +#include "core/inc/agent.h" +#include "core/inc/memory_region.h" +#include "inc/hsa.h" + +namespace rocr { +namespace core { + +using MemFlags = uint32_t; + +struct MemProperties { + MemFlags flags_; + size_t size_bytes_; + uint64_t virtual_base_addr_; +}; + +/// @brief Kernel driver interface. +/// +/// @details A class used to provide an interface between the core runtime +/// and agent kernel drivers. It also maintains state associated with active +/// kernel drivers. +class Driver { + public: + Driver() = delete; + Driver(const std::string devnode_name, Agent::DeviceType agent_device_type); + virtual ~Driver() {} + + /// @brief Open a connection to the driver using name_. + /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully. + hsa_status_t Open(); + /// @brief Close a connection to the open driver using fd_. 
+ /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully. + hsa_status_t Close(); + + virtual hsa_status_t GetMemoryProperties(uint32_t node_id, MemProperties &mprops) const = 0; + + /// @brief Allocate agent-accessible memory (system or agent-local memory). + /// + /// @param[out] pointer to newly allocated memory. + /// + /// @retval HSA_STATUS_SUCCESS if memory was successfully allocated or + /// hsa_status_t error code if the memory allocation failed. + virtual hsa_status_t AllocateMemory(void** mem, size_t size, uint32_t node_id, + MemFlags flags) = 0; + + virtual hsa_status_t FreeMemory(void* mem, uint32_t node_id) = 0; + + virtual hsa_status_t CreateQueue(Queue &queue) = 0; + + virtual hsa_status_t DestroyQueue(Queue &queue) const = 0; + + /// Specify the agent device type this driver is for. + const Agent::DeviceType agent_device_type_; + + protected: + const std::string devnode_name_; + int fd_ = -1; +}; + +} // namespace core +} // namespace rocr + +#endif // header guard diff --git a/src/core/inc/host_queue.h b/src/core/inc/host_queue.h index 8521aed7b..ce0bfbbcc 100644 --- a/src/core/inc/host_queue.h +++ b/src/core/inc/host_queue.h @@ -152,10 +152,18 @@ class HostQueue : public Queue { return HSA_STATUS_ERROR_INVALID_QUEUE; } - void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override { + void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) override { assert(false && "HostQueue::ExecutePM4 is unimplemented"); } + hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) override { + assert(false && "HostQueue::GetInfo is unimplemented"); + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + void* operator new(size_t size) { return _aligned_malloc(size, HSA_QUEUE_ALIGN_BYTES); } diff --git a/src/core/inc/hsa_api_trace_int.h b/src/core/inc/hsa_api_trace_int.h index 
f270efb72..e61e8a5db 100644 --- a/src/core/inc/hsa_api_trace_int.h +++ b/src/core/inc/hsa_api_trace_int.h @@ -53,6 +53,7 @@ namespace core { static const uint32_t HSA_EXT_FINALIZER_API_TABLE_ID = 0; static const uint32_t HSA_EXT_IMAGE_API_TABLE_ID = 1; static const uint32_t HSA_EXT_AQLPROFILE_API_TABLE_ID = 2; + static const uint32_t HSA_EXT_PC_SAMPLING_API_TABLE_ID = 3; ::HsaApiTable hsa_api; ::CoreApiTable core_api; @@ -60,6 +61,7 @@ namespace core { ::FinalizerExtTable finalizer_api; ::ImageExtTable image_api; ::ToolsApiTable tools_api; + ::PcSamplingExtTable pcs_api; HsaApiTable(); void Init(); diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h index 19357d2d8..e5717b3fa 100644 --- a/src/core/inc/hsa_ext_amd_impl.h +++ b/src/core/inc/hsa_ext_amd_impl.h @@ -302,6 +302,10 @@ hsa_status_t HSA_API hsa_amd_portable_close_dmabuf(int dmabuf); hsa_status_t hsa_amd_vmem_address_reserve(void** ptr, size_t size, uint64_t address, uint64_t flags); +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_vmem_address_reserve_align(void** ptr, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + // Mirrors Amd Extension Apis hsa_status_t hsa_amd_vmem_address_free(void* ptr, size_t size); @@ -349,6 +353,10 @@ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle(hsa_amd_vmem_alloc_ha // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + } // namespace amd } // namespace rocr diff --git a/src/core/inc/hsa_ext_interface.h b/src/core/inc/hsa_ext_interface.h index 20a51759f..c6b275b1e 100644 --- a/src/core/inc/hsa_ext_interface.h +++ b/src/core/inc/hsa_ext_interface.h @@ -57,12 +57,17 @@ struct ImageExtTableInternal : public ImageExtTable { decltype(::hsa_amd_image_get_info_max_dim)* hsa_amd_image_get_info_max_dim_fn; 
}; +struct PcSamplingExtTableInternal : public PcSamplingExtTable {}; + class ExtensionEntryPoints { public: // Table of function pointers for Hsa Extension Image ImageExtTableInternal image_api; + // Table of function pointers for Hsa vendor PC Sampling + PcSamplingExtTableInternal pcs_api; + // Table of function pointers for Hsa Extension Finalizer FinalizerExtTable finalizer_api; @@ -77,6 +82,12 @@ class ExtensionEntryPoints { // Reset Api tables to point to null implementations void UnloadImage(); + // Update PC Sampling Api table with handles to implementation + void LoadPcSampling(); + + // Reset PC Sampling tables to point to null implementations + void UnloadPcSampling(); + private: typedef void (*Load_t)(const ::HsaApiTable* table); typedef void (*Unload_t)(); @@ -89,6 +100,9 @@ class ExtensionEntryPoints { // Initialize table for HSA Image Extension Api's void InitImageExtTable(); + // Initialize table for HSA PC Sampling Extension Api's + void InitPcSamplingExtTable(); + // Initialize Amd Ext table for Api related to Images void InitAmdExtTable(); @@ -96,7 +110,7 @@ class ExtensionEntryPoints { void UpdateAmdExtTable(decltype(::hsa_amd_image_create)* func_ptr); DISALLOW_COPY_AND_ASSIGN(ExtensionEntryPoints); -}; +}; } // namespace core } // namespace rocr diff --git a/src/core/inc/intercept_queue.h b/src/core/inc/intercept_queue.h index e7784ffaa..8088d5e92 100644 --- a/src/core/inc/intercept_queue.h +++ b/src/core/inc/intercept_queue.h @@ -120,8 +120,11 @@ class QueueWrapper : public Queue { hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override { return wrapped->GetCUMasking(num_cu_mask_count, cu_mask); } - void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override { - wrapped->ExecutePM4(cmd_data, cmd_size_b); + void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) override { 
+ wrapped->ExecutePM4(cmd_data, cmd_size_b, acquireFence, releaseFence, signal); } void SetProfiling(bool enabled) override { wrapped->SetProfiling(enabled); } @@ -266,6 +269,9 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi StoreRelaxed(value); } + /// @brief Provide information about the queue + hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) override; + static __forceinline bool IsType(core::Signal* signal) { return signal->IsType(&rtti_id_); } static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id_); } diff --git a/src/core/inc/memory_region.h b/src/core/inc/memory_region.h index f36b195e7..66acf3636 100644 --- a/src/core/inc/memory_region.h +++ b/src/core/inc/memory_region.h @@ -58,11 +58,12 @@ class Agent; class MemoryRegion : public Checked<0x9C961F19EE175BB3> { public: MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain, - core::Agent* owner) + bool user_visible, core::Agent* owner) : fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), extended_scope_fine_grain_(extended_scope_fine_grain), + user_visible_(user_visible), owner_(owner) { assert(owner_ != NULL); } @@ -103,6 +104,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { // Note: The node_id needs to be the node_id of the device even though this is allocating // system memory AllocateGTTAccess = (1 << 9), + AllocateContiguous = (1 << 10), // Physically contiguous memory }; typedef uint32_t AllocateFlags; @@ -132,6 +134,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { __forceinline bool full_profile() const { return full_profile_; } + __forceinline bool user_visible() const { return user_visible_; } + __forceinline core::Agent* owner() const { return owner_; } private: @@ -139,6 +143,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { const bool kernarg_; const bool full_profile_; const bool 
extended_scope_fine_grain_; + const bool user_visible_; + core::Agent* owner_; }; } // namespace core diff --git a/src/core/inc/queue.h b/src/core/inc/queue.h index 112510234..75a291a66 100644 --- a/src/core/inc/queue.h +++ b/src/core/inc/queue.h @@ -52,7 +52,7 @@ #include "core/inc/memory_region.h" #include "core/util/utils.h" #include "inc/amd_hsa_queue.h" - +#include "inc/hsa_ext_amd.h" #include "hsakmt/hsakmt.h" namespace rocr { @@ -346,14 +346,33 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { /// @return hsa_status_t virtual hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) = 0; - // @brief Submits a block of PM4 and waits until it has been executed. - virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) = 0; + /// @brief Submits a block of PM4. + /// + /// @param cmd_data pointer to command buffer + /// + /// @param cmd_size_b command buffer size in bytes + /// + /// @param acquireFence acquire-fence type + /// + /// @param releaseFence acquire-fence type + /// + /// @param signal optional wait signal + /// + /// if @p signal is provided, function will return without waiting for commands to be executed + /// if @p signal is NULL, waits until commands have been executed. + virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) = 0; virtual void SetProfiling(bool enabled) { AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, (enabled != 0)); } + /// @ brief Returns queue queries about the queue + virtual hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) = 0; + /// @ brief Reports async queue errors to stderr if no other error handler was registered. 
static void DefaultErrorHandler(hsa_status_t status, hsa_queue_t* source, void* data); diff --git a/src/core/inc/registers.h b/src/core/inc/registers.h index b8ba6aa5d..2dc30068d 100644 --- a/src/core/inc/registers.h +++ b/src/core/inc/registers.h @@ -134,6 +134,23 @@ SQ_SEL_W = 0x00000007, float f32All; }; + union COMPUTE_TMPRING_SIZE_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 18; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int WAVESIZE : 18; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + union SQ_BUF_RSRC_WORD0 { struct { @@ -301,6 +318,44 @@ SQ_SEL_W = 0x00000007, unsigned int DST_SEL_Z : 3; unsigned int DST_SEL_Y : 3; unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int COMPRESSION_EN : 1; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int COMPRESSION_EN : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; #endif } bitfields, bits; unsigned int u32All; diff --git a/src/core/inc/runtime.h 
b/src/core/inc/runtime.h index 6f5997f7e..3d4d889d8 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -359,7 +359,7 @@ class Runtime { hsa_status_t DmaBufClose(int dmabuf); - hsa_status_t VMemoryAddressReserve(void** ptr, size_t size, uint64_t address, uint64_t flags); + hsa_status_t VMemoryAddressReserve(void** ptr, size_t size, uint64_t address, uint64_t alignment, uint64_t flags); hsa_status_t VMemoryAddressFree(void* ptr, size_t size); diff --git a/src/core/inc/sdma_registers.h b/src/core/inc/sdma_registers.h index d94eed43f..7a26b7350 100644 --- a/src/core/inc/sdma_registers.h +++ b/src/core/inc/sdma_registers.h @@ -130,7 +130,7 @@ typedef struct SDMA_PKT_COPY_LINEAR_TAG { static const size_t kMaxSize_ = 0x3fffe0; } SDMA_PKT_COPY_LINEAR; -// linear sub-window +// linear sub-window (pre-GFX12) typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG { static const unsigned int pitch_bits = 19; static const unsigned int slice_bits = 28; @@ -253,6 +253,121 @@ typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG { } SDMA_PKT_COPY_LINEAR_RECT; +// linear sub-window (GFX12) +typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG_GFX12 { + static const unsigned int pitch_bits = 16; + static const unsigned int slice_bits = 32; + static const unsigned int rect_xy_bits = 16; + static const unsigned int rect_z_bits = 14; + + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved : 13; + unsigned int element : 3; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int src_addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } SRC_ADDR_LO_UNION; + + union { + struct { + unsigned int src_addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } SRC_ADDR_HI_UNION; + + union { + struct { + unsigned int src_offset_x : 16; + unsigned int src_offset_y : 16; + }; + unsigned int DW_3_DATA; + } SRC_PARAMETER_1_UNION; + + union { + struct { + unsigned int src_offset_z : 14; + unsigned int reserved_1 : 2; + unsigned int src_pitch : 
pitch_bits; + }; + unsigned int DW_4_DATA; + } SRC_PARAMETER_2_UNION; + + union { + struct { + unsigned int src_slice_pitch : slice_bits; + }; + unsigned int DW_5_DATA; + } SRC_PARAMETER_3_UNION; + + union { + struct { + unsigned int dst_addr_31_0 : 32; + }; + unsigned int DW_6_DATA; + } DST_ADDR_LO_UNION; + + union { + struct { + unsigned int dst_addr_63_32 : 32; + }; + unsigned int DW_7_DATA; + } DST_ADDR_HI_UNION; + + union { + struct { + unsigned int dst_offset_x : 16; + unsigned int dst_offset_y : 16; + }; + unsigned int DW_8_DATA; + } DST_PARAMETER_1_UNION; + + union { + struct { + unsigned int dst_offset_z : 14; + unsigned int reserved_1 : 2; + unsigned int dst_pitch : pitch_bits; + }; + unsigned int DW_9_DATA; + } DST_PARAMETER_2_UNION; + + union { + struct { + unsigned int dst_slice_pitch : slice_bits; + }; + unsigned int DW_10_DATA; + } DST_PARAMETER_3_UNION; + + union { + struct { + unsigned int rect_x : rect_xy_bits; + unsigned int rect_y : rect_xy_bits; + }; + unsigned int DW_11_DATA; + } RECT_PARAMETER_1_UNION; + + union { + struct { + unsigned int rect_z : rect_z_bits; + unsigned int reserved_1 : 6; + unsigned int dst_cache_policy : 3; + unsigned int reserved_2 : 5; + unsigned int src_cache_policy : 3; + unsigned int reserved_3 : 1; + }; + unsigned int DW_12_DATA; + } RECT_PARAMETER_2_UNION; + +} SDMA_PKT_COPY_LINEAR_RECT_GFX12; + typedef struct SDMA_PKT_CONSTANT_FILL_TAG { union { struct { diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h index a1096ddde..39e532186 100644 --- a/src/core/inc/signal.h +++ b/src/core/inc/signal.h @@ -407,7 +407,7 @@ class Signal { core::Agent* async_copy_agent_; private: - static HybridMutex ipcLock_; + static KernelMutex ipcLock_; static std::map ipcMap_; static Signal* lookupIpc(hsa_signal_t signal); diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index c968eec05..4720a6b32 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -68,6 
+68,7 @@ #include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/amd_gpu_pm4.h" #include "core/inc/hsa_amd_tool_int.hpp" +#include "core/inc/amd_core_dump.hpp" namespace rocr { namespace AMD { @@ -218,6 +219,12 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr queue_scratch_.mem_alignment_size = 1024; queue_scratch_.use_once_limit = core::Runtime::runtime_singleton_->flag().scratch_single_limit(); + if (queue_scratch_.use_once_limit > agent_->MaxScratchDevice()) { + fprintf(stdout, "User specified scratch limit exceeds device limits (requested:%lu max:%lu)!\n", + queue_scratch_.use_once_limit, agent_->MaxScratchDevice()); + queue_scratch_.use_once_limit = agent_->MaxScratchDevice(); + } + queue_scratch_.use_alt_limit = 0; queue_scratch_.async_reclaim = agent_->AsyncScratchReclaimEnabled(); @@ -358,13 +365,17 @@ AqlQueue::~AqlQueue() { // Remove kfd exception handler exceptionState |= ERROR_HANDLER_TERMINATE; while ((exceptionState & ERROR_HANDLER_DONE) != ERROR_HANDLER_DONE) { + const uint64_t timeout_ms = 5000; + exception_signal_->StoreRelease(-1ull); - exception_signal_->WaitRelaxed(HSA_SIGNAL_CONDITION_NE, -1ull, -1ull, HSA_WAIT_STATE_BLOCKED); + exception_signal_->WaitRelaxed(HSA_SIGNAL_CONDITION_NE, -1ull, timeout_ms, + HSA_WAIT_STATE_BLOCKED); } Inactivate(); - agent_->ReleaseQueueMainScratch(queue_scratch_); - agent_->ReleaseQueueAltScratch(queue_scratch_); + + if (queue_scratch_.main_queue_base) agent_->ReleaseQueueMainScratch(queue_scratch_); + if (queue_scratch_.alt_queue_base) agent_->ReleaseQueueAltScratch(queue_scratch_); FreeRegisteredRingBuffer(); exception_signal_->DestroySignal(); @@ -527,6 +538,25 @@ void AqlQueue::StoreRelease(hsa_signal_value_t value) { StoreRelaxed(value); } +hsa_status_t AqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) { + switch (attribute) { + case HSA_AMD_QUEUE_INFO_AGENT: + *(reinterpret_cast(value)) = agent_->public_handle(); + break; + case 
HSA_AMD_QUEUE_INFO_DOORBELL_ID: + if (doorbell_type_ == 2) + // Hardware doorbell supports AQL semantics. + *(reinterpret_cast(value)) = + reinterpret_cast(signal_.hardware_doorbell_ptr); + else + return HSA_STATUS_ERROR_INVALID_QUEUE; + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; +} + uint32_t AqlQueue::ComputeRingBufferMinPkts() { // From CP_HQD_PQ_CONTROL.QUEUE_SIZE specification: // Size of the primary queue (PQ) will be: 2^(HQD_QUEUE_SIZE+1) DWs. @@ -1260,6 +1290,21 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { return false; } + // Fallback if KFD does not support GPU core dump. In this case, there core dump is + // generated by hsa-runtime. + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump && + queue->agent_->isa()->GetMajorVersion() != 11) { + + if (pcs::PcsRuntime::instance()->SessionsActive()) + fprintf(stderr, "GPU core dump skipped because PC Sampling active\n"); + else if (amd::coredump::dump_gpu_core()) + fprintf(stderr, "GPU core dump failed\n"); + // supports_core_dump flag is overwritten to avoid generate core dump file again + // caught by a different exception handler. Such as VMFaultHandler. + core::Runtime::runtime_singleton_->KfdVersion( + core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true); + } + for (auto& error : QueueErrors) { if (error_code & (1 << (error.code - 1))) { errorCode = error.status; @@ -1375,7 +1420,12 @@ void AqlQueue::SetProfiling(bool enabled) { return; } -void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { +// If in_signal is NULL then this ExecutePM4 will block and wait for PM4 commands to complete +// If in_signal is provided, then ExecutePM4 will return and caller may wait for in_signal +// Note: On gfx8, there is no completion signal support, so ExecutePM4 will block even if +// in_signal is provided, and it is still valid to check in_signal after ExecutePM4 returns. 
+void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope_t acquireFence, + hsa_fence_scope_t releaseFence, hsa_signal_t* in_signal) { // pm4_ib_buf_ is a shared resource, so mutually exclude here. ScopedAcquire lock(&pm4_ib_mutex_); @@ -1411,7 +1461,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { // To respect multi-producer semantics, first buffer commands for the queue slot. constexpr uint32_t slot_size_dw = uint32_t(slot_size_b / sizeof(uint32_t)); uint32_t slot_data[slot_size_dw]; - hsa_signal_t signal = {0}; + hsa_signal_t local_signal = {0}; hsa_status_t err; if (agent_->isa()->GetMajorVersion() <= 8) { @@ -1456,28 +1506,32 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { // Construct an AQL packet to jump to the PM4 IB. struct amd_aql_pm4_ib { uint16_t header; - uint8_t amd_format; - uint8_t reserved0; + uint16_t ven_hdr; uint32_t ib_jump_cmd[4]; uint32_t dw_cnt_remain; - uint32_t reserved1[8]; + uint32_t reserved[8]; hsa_signal_t completion_signal; }; - constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1; + if (!in_signal) { + err = hsa_signal_create(1, 0, NULL, &local_signal); + assert(err == HSA_STATUS_SUCCESS); + } - err = hsa_signal_create(1, 0, NULL, &signal); - assert(err == HSA_STATUS_SUCCESS); + constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1; amd_aql_pm4_ib aql_pm4_ib{}; - aql_pm4_ib.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - aql_pm4_ib.amd_format = AMD_AQL_FORMAT_PM4_IB; + aql_pm4_ib.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE | + (acquireFence << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (releaseFence << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + + aql_pm4_ib.ven_hdr = AMD_AQL_FORMAT_PM4_IB; aql_pm4_ib.ib_jump_cmd[0] = ib_jump_cmd[0]; aql_pm4_ib.ib_jump_cmd[1] = ib_jump_cmd[1]; aql_pm4_ib.ib_jump_cmd[2] = ib_jump_cmd[2]; aql_pm4_ib.ib_jump_cmd[3] = ib_jump_cmd[3]; aql_pm4_ib.dw_cnt_remain = 0xA; - aql_pm4_ib.completion_signal = 
signal; + aql_pm4_ib.completion_signal = in_signal ? *in_signal : local_signal; memcpy(slot_data, &aql_pm4_ib, sizeof(aql_pm4_ib)); } else { @@ -1498,11 +1552,14 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { if (agent_->isa()->GetMajorVersion() <= 8) { while (queue->LoadReadIndexRelaxed() <= write_idx) os::YieldThread(); - } else { + + if (in_signal) hsa_signal_store_screlease(*in_signal, 0); + } else if (!in_signal) { + // On gfx9 and newer, if in_signal is not provided, we block and wait for own signal hsa_signal_value_t ret; - ret = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t)-1, HSA_WAIT_STATE_ACTIVE); - err = hsa_signal_destroy(signal); + ret = hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_ACTIVE); + err = hsa_signal_destroy(local_signal); assert(ret == 0 && err == HSA_STATUS_SUCCESS); } } @@ -1617,6 +1674,26 @@ void AqlQueue::FillBufRsrcWord3_Gfx11() { amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; } +void AqlQueue::FillBufRsrcWord3_Gfx12() { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.WRITE_COMPRESS_ENABLE = 0; + srd3.bits.COMPRESSION_EN = 0; + srd3.bits.COMPRESSION_ACCESS_MODE = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; +} + // Set concurrent wavefront limits only when scratch is being used. 
void AqlQueue::FillComputeTmpRingSize() { COMPUTE_TMPRING_SIZE tmpring_size = {}; @@ -1715,10 +1792,52 @@ void AqlQueue::FillComputeTmpRingSize_Gfx11() { amd_queue_.compute_tmpring_size = tmpring_size.u32All; } +// Set concurrent wavefront limits only when scratch is being used. +void AqlQueue::FillComputeTmpRingSize_Gfx12() { + // For GFX12, struct field size changes. + // Consider refactoring code for GFX11/GFX12 if no other changes. + COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {}; + if (queue_scratch_.main_size == 0) { + amd_queue_.compute_tmpring_size = tmpring_size.u32All; + return; + } + + const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + + // Determine the maximum number of waves device can support + uint32_t num_cus = agent_props.NumFComputeCores / (agent_props.NumSIMDPerCU * num_xcc); + uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU; + + // Scratch is allocated program COMPUTE_TMPRING_SIZE register + // Scratch Size per Wave is specified in terms of kilobytes + uint32_t wave_scratch = (((queue_scratch_.main_lanes_per_wave * queue_scratch_.main_size_per_thread) + + queue_scratch_.mem_alignment_size - 1) / + queue_scratch_.mem_alignment_size); + + tmpring_size.bits.WAVESIZE = wave_scratch; + assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow."); + + uint32_t num_waves = + queue_scratch_.main_size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size); + + // For GFX11 we specify number of waves per engine instead of total + num_waves /= agent_->properties().NumShaderBanks; + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves); + amd_queue_.compute_tmpring_size = tmpring_size.u32All; +} + // @brief Define the Scratch Buffer Descriptor and related parameters // that enable kernel access scratch memory void AqlQueue::InitScratchSRD() { switch (agent_->isa()->GetMajorVersion()) { + case 12: + FillBufRsrcWord0(); + FillBufRsrcWord1_Gfx11(); + 
FillBufRsrcWord2(); + FillBufRsrcWord3_Gfx12(); + FillComputeTmpRingSize_Gfx12(); + break; case 11: FillBufRsrcWord0(); FillBufRsrcWord1_Gfx11(); diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp index b39f86461..fe459e13d 100644 --- a/src/core/runtime/amd_blit_sdma.cpp +++ b/src/core/runtime/amd_blit_sdma.cpp @@ -122,6 +122,8 @@ BlitSdma::BlitSdma() cached_commit_index_(0), platform_atomic_support_(true), hdp_flush_support_(false), + gang_leader_(false), + is_ganged_(false), min_submission_size_(0) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } @@ -551,7 +553,13 @@ BlitSdma::SubmitCopyRe if (range->z > 1 && (src->slice == 0 || dst->slice == 0)) throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice needed."); - const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits; + // GFX12 or later use a different packet format that is incompatible (fields changed in size and location). + const bool isGFX12Plus = (agent_->isa()->GetMajorVersion() >= 12); + + // Common and GFX12 packet must match in size to use same code for vector/append. + static_assert(sizeof(SDMA_PKT_COPY_LINEAR_RECT) == sizeof(SDMA_PKT_COPY_LINEAR_RECT_GFX12)); + + const uint max_pitch = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::pitch_bits : SDMA_PKT_COPY_LINEAR_RECT::pitch_bits); std::vector pkts; std::vector bytes_moved; @@ -836,12 +844,15 @@ void BlitSdma::BuildCo return __builtin_ctz(width | 16); }; + // GFX12 or later use a different packet format that is incompatible (fields changed in size and location). 
+ const bool isGFX12Plus = (agent_->isa()->GetMajorVersion() >= 12); + // Limits in terms of element count - const uint32_t max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits; - const uint32_t max_slice = 1 << SDMA_PKT_COPY_LINEAR_RECT::slice_bits; - const uint32_t max_x = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits; - const uint32_t max_y = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits; - const uint32_t max_z = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits; + const uint32_t max_pitch = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::pitch_bits : SDMA_PKT_COPY_LINEAR_RECT::pitch_bits); + const uint32_t max_slice = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::slice_bits : SDMA_PKT_COPY_LINEAR_RECT::slice_bits); + const uint32_t max_x = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::rect_xy_bits : SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits); + const uint32_t max_y = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::rect_xy_bits : SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits); + const uint32_t max_z = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::rect_z_bits : SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits); // Find maximum element that describes the pitch and slice. // Pitch and slice must both be represented in units of elements. No element larger than this @@ -916,27 +927,52 @@ void BlitSdma::BuildCo x += xcount << element; - SDMA_PKT_COPY_LINEAR_RECT* pkt = + // GFX12 has a different packet format that is incompatible with pre-GFX12. 
+ if (isGFX12Plus) { + SDMA_PKT_COPY_LINEAR_RECT_GFX12* pkt = + (SDMA_PKT_COPY_LINEAR_RECT_GFX12*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT)); + *pkt = {}; + pkt->HEADER_UNION.op = SDMA_OP_COPY; + pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT; + pkt->HEADER_UNION.element = element; + pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase; + pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32; + pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff; + pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1; + pkt->SRC_PARAMETER_3_UNION.src_slice_pitch = + (range->z == 1) ? 0 : (src->slice >> element) - 1; + pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase; + pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32; + pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff; + pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1; + pkt->DST_PARAMETER_3_UNION.dst_slice_pitch = + (range->z == 1) ? 0 : (dst->slice >> element) - 1; + pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1; + pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1; + pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1; + } else { // Pre-GFX12, common packet used + SDMA_PKT_COPY_LINEAR_RECT* pkt = (SDMA_PKT_COPY_LINEAR_RECT*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT)); - *pkt = {}; - pkt->HEADER_UNION.op = SDMA_OP_COPY; - pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT; - pkt->HEADER_UNION.element = element; - pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase; - pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32; - pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff; - pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1; - pkt->SRC_PARAMETER_3_UNION.src_slice_pitch = + *pkt = {}; + pkt->HEADER_UNION.op = SDMA_OP_COPY; + pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT; + pkt->HEADER_UNION.element = element; + pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase; + pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32; + pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff; + 
pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1; + pkt->SRC_PARAMETER_3_UNION.src_slice_pitch = (range->z == 1) ? 0 : (src->slice >> element) - 1; - pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase; - pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32; - pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff; - pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1; - pkt->DST_PARAMETER_3_UNION.dst_slice_pitch = + pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase; + pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32; + pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff; + pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1; + pkt->DST_PARAMETER_3_UNION.dst_slice_pitch = (range->z == 1) ? 0 : (dst->slice >> element) - 1; - pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1; - pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1; - pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1; + pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1; + pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1; + pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1; + } } } } diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp index 7e68c7d23..df473d421 100644 --- a/src/core/runtime/amd_cpu_agent.cpp +++ b/src/core/runtime/amd_cpu_agent.cpp @@ -85,15 +85,15 @@ void CpuAgent::InitRegionList() { if (system_prop != mem_props.end()) system_props = *system_prop; MemoryRegion* system_region_fine = - new MemoryRegion(true, false, is_apu_node, false, this, system_props); + new MemoryRegion(true, false, is_apu_node, false, true, this, system_props); regions_.push_back(system_region_fine); MemoryRegion* system_region_kernarg = - new MemoryRegion(true, true, is_apu_node, false, this, system_props); + new MemoryRegion(true, true, is_apu_node, false, true, this, system_props); regions_.push_back(system_region_kernarg); if (!is_apu_node) { MemoryRegion* system_region_coarse = - new 
MemoryRegion(false, false, is_apu_node, false, this, system_props); + new MemoryRegion(false, false, is_apu_node, false, true, this, system_props); regions_.push_back(system_region_coarse); } } @@ -152,6 +152,7 @@ hsa_status_t CpuAgent::VisitRegion( hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) const { for (const core::MemoryRegion* region : regions) { + if (!region->user_visible()) continue; hsa_region_t region_handle = core::MemoryRegion::Convert(region); hsa_status_t status = callback(region_handle, data); if (status != HSA_STATUS_SUCCESS) { diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index d7f6a4fdb..ef2ddfb4c 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -65,6 +65,7 @@ #include "core/util/os.h" #include "inc/hsa_ext_image.h" #include "inc/hsa_ven_amd_aqlprofile.h" +#include "inc/hsa_ven_amd_pc_sampling.h" #include "core/inc/amd_trap_handler_v1.h" #include "core/inc/amd_blit_shaders.h" @@ -83,7 +84,6 @@ #define DEFAULT_SCRATCH_BYTES_PER_THREAD 2048 #define MAX_WAVE_SCRATCH 8387584 // See COMPUTE_TMPRING_SIZE.WAVESIZE #define MAX_NUM_DOORBELLS 0x400 -#define MAX_SCRATCH_APERTURE_PER_XCC 4294967296 #define DEFAULT_SCRATCH_SINGLE_LIMIT_ASYNC_PER_XCC (1 << 30) // 1 GB namespace rocr { @@ -92,6 +92,8 @@ extern HsaApiTable hsa_internal_api_table_; } // namespace core namespace AMD { +const uint64_t CP_DMA_DATA_TRANSFER_CNT_MAX = (1 << 26); + GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index) : GpuAgentInt(node), @@ -113,7 +115,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna sdma_blit_used_mask_(0), scratch_limit_async_threshold_(0), scratch_cache_( - [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) { + [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }), + trap_handler_tma_region_(NULL), + 
pcs_hosttrap_data_() { const bool is_apu_node = (properties_.NumCPUCores > 0); profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; @@ -259,68 +263,76 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_1010; ASICShader compute_10; ASICShader compute_11; + ASICShader compute_12; }; std::map compiled_shaders = { {"TrapHandler", { - {NULL, 0, 0, 0}, // gfx7 - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 - {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 - {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a - {NULL, 0, 0, 0}, // gfx940 - {NULL, 0, 0, 0}, // gfx942 - {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 - {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 - {NULL, 0, 0, 0}, // gfx11 + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a + {NULL, 0, 0, 0}, // gfx940 + {NULL, 0, 0, 0}, // gfx942 + {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 + {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 + {NULL, 0, 0, 0}, // gfx11 + // GFX12_TODO: Using one for GFX10 for now. + // If NULL is used (like GFX11), get an assert. 
+ {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx12 }}, {"TrapHandlerKfdExceptions", { - {NULL, 0, 0, 0}, // gfx7 - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a - {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 - {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942 - {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4}, // gfx1010 - {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 - {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942 + {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010 + {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 + {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 + {kCodeTrapHandlerV2_12, sizeof(kCodeTrapHandlerV2_12), 2, 4}, // gfx12 }}, {"CopyAligned", { - {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 - {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx9 - {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx90a - {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 - {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx942 - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 - {kCodeCopyAligned11, 
sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 + {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx9 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx90a + {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx942 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 + {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 + {kCodeCopyAligned12, sizeof(kCodeCopyAligned12), 32, 12}, // gfx12 }}, {"CopyMisaligned", { - {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 - {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx9 - {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx90a - {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10}, // gfx940 - {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx942 - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10 - {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 + {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx9 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx90a + {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx942 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 
23, 10}, // gfx10 + {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 + {kCodeCopyMisaligned12, sizeof(kCodeCopyMisaligned12), 23, 10}, // gfx12 }}, {"Fill", { - {kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7 - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 - {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx9 - {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx90a - {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 - {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx942 - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 - {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 + {kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx9 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx90a + {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx942 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 + {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 + {kCodeFill12, sizeof(kCodeFill12), 19, 8}, // gfx12 }}}; auto compiled_shader_it = compiled_shaders.find(func_name); @@ -363,6 +375,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar case 11: asic_shader = &compiled_shader_it->second.compute_11; break; + case 12: + asic_shader = &compiled_shader_it->second.compute_12; + break; default: assert(false && "Precompiled shader unavailable for target"); } @@ -449,19 +464,22 @@ void GpuAgent::InitRegionList() { case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { MemoryRegion* region = - new MemoryRegion(false, false, false, false, this, mem_props[mem_idx]); + new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]); regions_.push_back(region); if (region->IsLocalMemory()) { - regions_.push_back( - new MemoryRegion(false, false, false, 
true, this, mem_props[mem_idx])); - // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. - if ((properties_.HiveID != 0) || - (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) { + // Extended Fine-Grain memory + if (!(isa_->GetMajorVersion() == 12 && isa_->GetMinorVersion() == 0)) regions_.push_back( - new MemoryRegion(true, false, false, false, this, mem_props[mem_idx])); - } + new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx])); + + // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. + bool user_visible = (properties_.HiveID != 0) || + core::Runtime::runtime_singleton_->flag().fine_grain_pcie(); + + regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this, + mem_props[mem_idx])); } break; } @@ -502,10 +520,9 @@ void GpuAgent::InitScratchPool() { size_t max_scratch_len = queue_scratch_len_ * max_queues_; #if defined(HSA_LARGE_MODEL) && defined(__linux__) - const size_t max_scratch_device = properties_.NumXcc * MAX_SCRATCH_APERTURE_PER_XCC; // For 64-bit linux use max queues unless otherwise specified - if ((max_scratch_len == 0) || (max_scratch_len > max_scratch_device)) { - max_scratch_len = max_scratch_device; // 4GB per XCC aperture max + if ((max_scratch_len == 0) || (max_scratch_len > MaxScratchDevice())) { + max_scratch_len = MaxScratchDevice(); // 4GB per XCC aperture max } #endif @@ -536,6 +553,12 @@ void GpuAgent::InitAsyncScratchThresholds() { void GpuAgent::ReserveScratch() { size_t reserved_sz = core::Runtime::runtime_singleton_->flag().scratch_single_limit(); + if (reserved_sz > MaxScratchDevice()) { + fprintf(stdout, "User specified scratch limit exceeds device limits (requested:%lu max:%lu)!\n", + reserved_sz, MaxScratchDevice()); + reserved_sz = MaxScratchDevice(); + } + size_t available; HSAKMT_STATUS err = hsaKmtAvailableMemory(node_id(), &available); assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAvailableMemory failed"); @@ -646,6 +669,8 @@ 
hsa_status_t GpuAgent::VisitRegion( void* data) const { AMD::callback_t call(callback); for (const core::MemoryRegion* region : regions) { + if (!region->user_visible()) continue; + const AMD::MemoryRegion* amd_region = reinterpret_cast(region); @@ -687,9 +712,8 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) { break; case 9: sdma = new BlitSdmaV4(); - copy_size_override = (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 10) || - isa_->GetMinorVersion() > 0 ? copy_size_overrides[1] : - copy_size_overrides[0]; + copy_size_override = (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 10) ? + copy_size_overrides[1] : copy_size_overrides[0]; break; case 10: sdma = new BlitSdmaV5(); @@ -697,6 +721,7 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) { copy_size_overrides[1]; break; case 11: + case 12: sdma = new BlitSdmaV5(); copy_size_override = copy_size_overrides[1]; break; @@ -732,18 +757,29 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { void GpuAgent::InitDma() { // Setup lazy init pointers on queues and blits. - auto queue_lambda = [this]() { - auto ret = CreateInterceptibleQueue(); - if (ret == nullptr) + auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) { + auto queue = CreateInterceptibleQueue(); + if (queue == nullptr) throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Internal queue creation failed."); - return ret; + + if (priority != HSA_QUEUE_PRIORITY_NORMAL) + if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, + "Failed to increase queue priority for PC Sampling"); + return queue; }; + // Dedicated compute queue for host-to-device blits. queues_[QueueBlitOnly].reset(queue_lambda); // Share utility queue with device-to-host blits. queues_[QueueUtility].reset(queue_lambda); + // Dedicated compute queue for PC Sampling CP-DMA commands. 
We need a dedicated queue that runs at + // highest priority because we do not want the CP-DMA commands to be delayed/blocked due to + // other dispatches/barriers that could be in the other AQL queues. + queues_[QueuePCSampling].reset([queue_lambda, this]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); }); + // Decide which engine to use for blits. auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue, bool isHostToDev) { Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); @@ -846,7 +882,7 @@ void GpuAgent::PreloadBlits() { hsa_status_t GpuAgent::PostToolsInit() { // Defer memory allocation until agents have been discovered. - InitNumaAllocator(); + InitAllocators(); InitScratchPool(); BindTrapHandler(); InitDma(); @@ -922,7 +958,6 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, } ScopedAcquire lock(&sdma_gang_lock_); - if (gang_factor == 1) sdma_gang_lock_.Release(); // Manage internal gang signals std::vector gang_signals; if (gang_factor > 1) { @@ -1170,16 +1205,8 @@ void GpuAgent::GetInfoMemoryProperties(uint8_t value[8]) const { }; // Fill the HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU flag - switch (properties_.DeviceId) { - case 0x15DD: /* gfx902 - Raven Ridge */ - case 0x15D8: /* gfx909 - Raven Ridge 2 */ - case 0x1636: /* gfx90c - Renoir */ - case 0x74A0: /* gfx940 and gfx942-APU */ + if (properties_.Integrated) setFlag(HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU); - break; - default: - break; - } } hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { @@ -1302,6 +1329,10 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { setFlag(HSA_EXTENSION_IMAGES); } + if (core::hsa_internal_api_table_.pcs_api.hsa_ven_amd_pcs_iterate_configuration_fn != NULL) { + setFlag(HSA_EXTENSION_AMD_PC_SAMPLING); + } + if (os::LibHandle lib = os::LoadLib(kAqlProfileLib)) { os::CloseLib(lib); setFlag(HSA_EXTENSION_AMD_AQLPROFILE); @@ -1900,8 +1931,7 @@ void 
GpuAgent::AsyncReclaimScratchQueues() { } hsa_status_t GpuAgent::SetAsyncScratchThresholds(size_t use_once_limit) { - if (use_once_limit > properties_.NumXcc * MAX_SCRATCH_APERTURE_PER_XCC) - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if (use_once_limit > MaxScratchDevice()) return HSA_STATUS_ERROR_INVALID_ARGUMENT; scratch_limit_async_threshold_ = use_once_limit; @@ -2042,6 +2072,58 @@ void GpuAgent::SyncClocks() { assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error"); } +hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* pcs_stochastic_buffers) { + // Assemble the trap handler source code. + void* tma_addr = nullptr; + uint64_t tma_size = 0; + + assert(core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging); + + AssembleShader("TrapHandlerKfdExceptions", AssembleTarget::ISA, trap_code_buf_, + trap_code_buf_size_); + + /* pcs_hosttrap_buffers and pcs_stochastic_buffers are NULL until PC sampling is enabled */ + if (pcs_hosttrap_buffers || pcs_stochastic_buffers) { + // ON non-large BAR systems, we cannot access device memory so we create a host copy + // and then do a DmaCopy to device memory + void* tma_region_host = (uint64_t*)system_allocator()(2 * sizeof(void*), 0x1000, 0); + if (tma_region_host == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + MAKE_SCOPE_GUARD([&]() { system_deallocator()(tma_region_host); }); + + ((uint64_t*)tma_region_host)[0] = (uint64_t)pcs_hosttrap_buffers; + ((uint64_t*)tma_region_host)[1] = (uint64_t)pcs_stochastic_buffers; + + if (!trap_handler_tma_region_) { + trap_handler_tma_region_ = (uint64_t*)finegrain_allocator()(2 * sizeof(void*), 0); + if (trap_handler_tma_region_ == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // NearestCpuAgent owns pool returned system_allocator() + auto cpuAgent = GetNearestCpuAgent()->public_handle(); + + hsa_status_t ret = + AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_); + assert(ret == 
HSA_STATUS_SUCCESS); + } + + /* On non-large BAR systems, we may not be able to access device memory, so do a DmaCopy */ + if (DmaCopy(trap_handler_tma_region_, tma_region_host, 2 * sizeof(void*)) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + tma_size = 2 * sizeof(void*); + tma_addr = trap_handler_tma_region_; + } else if (trap_handler_tma_region_) { + finegrain_deallocator()(trap_handler_tma_region_); + trap_handler_tma_region_ = NULL; + } + + // Bind the trap handler to this node. + HSAKMT_STATUS retKmt = + hsaKmtSetTrapHandler(node_id(), trap_code_buf_, trap_code_buf_size_, tma_addr, tma_size); + + return (retKmt != HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; +} + void GpuAgent::BindTrapHandler() { if (isa_->GetMajorVersion() == 7) { // No trap handler support on Gfx7, soft error. @@ -2096,7 +2178,7 @@ void GpuAgent::InvalidateCodeCaches() { // Microcode is handling code cache invalidation. return; } - } else if (isa_->GetMajorVersion() > 11) { + } else if (isa_->GetMajorVersion() > 12) { assert(false && "Code cache invalidation not implemented for this agent"); } @@ -2238,7 +2320,7 @@ void GpuAgent::Trim() { scratch_cache_.trim(false); } -void GpuAgent::InitNumaAllocator() { +void GpuAgent::InitAllocators() { for (auto pool : GetNearestCpuAgent()->regions()) { if (pool->kernarg()) { system_allocator_ = [pool](size_t size, size_t alignment, @@ -2252,11 +2334,29 @@ void GpuAgent::InitNumaAllocator() { }; system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; + } + } + assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool."); - return; + // Setup fine-grain allocator + for (auto region : regions()) { + const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region; + if (amd_region->IsLocalMemory() && amd_region->fine_grain()) { + finegrain_allocator_ = [region](size_t size, + MemoryRegion::AllocateFlags alloc_flags) -> void* { + void* ptr = nullptr; + return 
(HSA_STATUS_SUCCESS == + core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr)) + ? ptr + : nullptr; + }; + + finegrain_deallocator_ = [](void* ptr) { + core::Runtime::runtime_singleton_->FreeMemory(ptr); + }; } } - assert(false && "Nearest NUMA node did not have a kernarg pool."); + assert(finegrain_deallocator_ && "Agent does not have a fine-grain allocator"); } core::Agent* GpuAgent::GetNearestCpuAgent() const { @@ -2273,5 +2373,808 @@ core::Agent* GpuAgent::GetNearestCpuAgent() const { return nearCpu; } +hsa_status_t ConvertHsaKmtPcSamplingInfoToHsa(HsaPcSamplingInfo* hsaKmtPcSampling, + hsa_ven_amd_pcs_configuration_t* hsaPcSampling) { + assert(hsaKmtPcSampling && "Invalid hsaKmtPcSampling"); + assert(hsaPcSampling && "Invalid hsaPcSampling"); + + switch (hsaKmtPcSampling->method) { + case HSA_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1: + hsaPcSampling->method = HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1; + break; + case HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1: + hsaPcSampling->method = HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1; + break; + default: + // Sampling method not supported do not return this method to the user + return HSA_STATUS_ERROR; + } + switch (hsaKmtPcSampling->units) { + case HSA_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS: + hsaPcSampling->units = HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS; + break; + case HSA_PC_SAMPLING_UNIT_INTERVAL_CYCLES: + hsaPcSampling->units = HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES; + break; + case HSA_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS: + hsaPcSampling->units = HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS; + break; + default: + // Sampling unit not supported do not return this method to the user + return HSA_STATUS_ERROR; + } + + hsaPcSampling->min_interval = hsaKmtPcSampling->value_min; + hsaPcSampling->max_interval = hsaKmtPcSampling->value_max; + hsaPcSampling->flags = hsaKmtPcSampling->flags; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t 
GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, + void* cb_data) { + uint32_t size = 0; + + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) + return HSA_STATUS_ERROR; + + // First query to get size of list needed + HSAKMT_STATUS ret = hsaKmtPcSamplingQueryCapabilities(node_id(), NULL, 0, &size); + if (ret != HSAKMT_STATUS_SUCCESS || size == 0) return HSA_STATUS_ERROR; + + std::vector sampleInfoList(size); + ret = hsaKmtPcSamplingQueryCapabilities(node_id(), sampleInfoList.data(), sampleInfoList.size(), + &size); + + if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR; + + for (uint32_t i = 0; i < size; i++) { + hsa_ven_amd_pcs_configuration_t hsaPcSampling; + if (ConvertHsaKmtPcSamplingInfoToHsa(&sampleInfoList[i], &hsaPcSampling) == HSA_STATUS_SUCCESS + && cb(&hsaPcSampling, cb_data) == HSA_STATUS_INFO_BREAK) + return HSA_STATUS_SUCCESS; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) { + hsa_status_t ret; + HsaPcSamplingInfo sampleInfo = {}; + HsaPcSamplingTraceId thunkId; + + // IOCTL id does not exist at the moment, so passing 0 is OK, + // since it will be overridden later in this function. + ret = PcSamplingCreateFromId(0, session); + if (ret != HSA_STATUS_SUCCESS) return ret; + + session.GetHsaKmtSamplingInfo(&sampleInfo); + HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId); + if (retkmt != HSAKMT_STATUS_SUCCESS) { + return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED) ? 
(hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY + : HSA_STATUS_ERROR; + } + + debug_print("Created PC sampling session with thunkId:%d\n", thunkId); + + session.SetThunkId(thunkId); + + return ret; +} + +hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId, + pcs::PcsRuntime::PcSamplingSession& session) { + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + // TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be + // able to support multiple sessions at a time. But this makes the session.HandleSampleData more + // complicated if multiple sessions have different buffer sizes. + if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // This is current amd_aql_queue->pm4_ib_size_b_ + ht_data.cmd_data_sz = 0x1000; + ht_data.cmd_data = (uint32_t*)malloc(ht_data.cmd_data_sz); + assert(ht_data.cmd_data); + + if (HSA::hsa_signal_create(1, 0, NULL, &ht_data.exec_pm4_signal) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + ht_data.old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0); + assert(ht_data.old_val); + + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.old_val)) + return HSA_STATUS_ERROR; + + // Local copy of hosttrap data - we cannot access device memory directly on non-large BAR + // systems + pcs_hosttrap_sampling_data_t* device_datahost = + (pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0); + if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); }); + + memset(device_datahost, 0, sizeof(*device_datahost)); + + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) != + HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() { + if (ht_data.device_data) { + if (ht_data.device_data->done_sig0.handle) + 
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0); + if (ht_data.device_data->done_sig1.handle) + HSA::hsa_signal_destroy(ht_data.device_data->done_sig1); + + finegrain_deallocator()(ht_data.device_data); + } + if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer); + }); + + // Force creating of PC Sampling queue to trigger exception early in case we exceed max availble + // CP queues on this agent + queues_[QueuePCSampling].touch(); + + /* + * When calling queue->ExecutePM4() Indirect Buffer size which is 0x1000 bytes (1024 DW). + * The maximum indirect buffer size we need occurs when we enqueue the + * WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops: + * For WAIT_REG_MEM = 7 DW + * For each DMA_COPY = 7 DW + * For WRITE_DATA_CMD = 6 DW + * + * So maximum number of DMA_COPY ops is: + * (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY) + * (1024 - 7 - 6) / 7 = 144 + * + * Each DMA_COPY op can transfer (1 << 26) bytes, which is 9 GB. trap_buffer_size is a 32-bit + * number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size. + * Set current limit to 256 MB to limit device VRAM usage + */ + const size_t max_trap_buffer_size = + core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size(); + + /* + * We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer + * Warning: This currently assumes that client latency is smaller than time to fill 1 + * trap-buffer If latency is bigger, we have to increate host-buffer + * + * host-buffer must be >= client-buffer so that we can copy full size of client-buffer each + * time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of + * trap-buffers + * + * if client-buffer size is greater than 2x max_trap_buffer_size: + * We are limited by max_trap_buffer_size. 
+ * trap-buffer = max-trap-buffer-size + * host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer + * else: + * We reduce the trap-buffers so that: + * trap-buffer = half of user-buffer + * host-buffer = 2*user-buffer + * + * TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to + * factor in client latency. Using a direct-copy to the client buffer would be more efficient. + * Revisit this once we have empirical data of latency vs how long it takes to fill 1 + * trap-buffer. + */ + + size_t trap_buffer_size = 0; + if (session.buffer_size() > 2 * max_trap_buffer_size) { + trap_buffer_size = max_trap_buffer_size; + ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size); + } else { + trap_buffer_size = session.buffer_size() / 2; + ht_data.host_buffer_size = 2 * session.buffer_size(); + } + + ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0); + if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) != + HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + device_datahost->buf_size = trap_buffer_size / session.sample_size(); + + if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig0) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig1) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // TODO: Once we have things working and can measure + // latency after 2nd level trap handler decrements signals and set watermark accordingly + device_datahost->buf_watermark0 = 0.8 * device_datahost->buf_size; + device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size; + + // Allocate device memory for 2nd level trap handler TMA + size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size); + ht_data.device_data = 
(pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0); + if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // This cpuAgent is the owner of the system_allocator() pool + auto cpuAgent = GetNearestCpuAgent()->public_handle(); + hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data); + assert(ret == HSA_STATUS_SUCCESS); + + if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) != + HSA_STATUS_SUCCESS) { + debug_print("Failed to dmaCopy!\n"); + return HSA_STATUS_ERROR; + } + + uint8_t* device_buf_ptr = + ((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t); + if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) != + HSA_STATUS_SUCCESS) { + debug_print("Failed to dmaFill!\n"); + return HSA_STATUS_ERROR; + } + + ht_data.lost_sample_count = 0; + ht_data.host_buffer_wrap_pos = 0; + ht_data.host_write_ptr = ht_data.host_buffer; + ht_data.host_read_ptr = ht_data.host_write_ptr; + + ht_data.session = &session; + freeHostTrapResources.Dismiss(); + + if (UpdateTrapHandlerWithPCS(ht_data.device_data, NULL) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; + } + + session.SetThunkId(ioctlId); + ht_data.session = &session; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) { + if (PcSamplingStop(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId()); + ht_data.session = NULL; + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + free(ht_data.cmd_data); + system_deallocator()(ht_data.old_val); + HSA::hsa_signal_destroy(ht_data.exec_pm4_signal); + HSA::hsa_signal_destroy(ht_data.device_data->done_sig0); + HSA::hsa_signal_destroy(ht_data.device_data->done_sig1); + finegrain_deallocator()(ht_data.device_data); + 
system_deallocator()(ht_data.host_buffer); + + ht_data.device_data = NULL; + ht_data.host_buffer = NULL; + ht_data.session = NULL; + + UpdateTrapHandlerWithPCS(NULL, NULL); + } + return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; +} + +hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) { + if (session.isActive()) return HSA_STATUS_SUCCESS; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + auto method = session.method(); + if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + if (ht_data.session->isActive()) { + debug_warning("Already have a Host trap session in progress!"); + return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY; + } + ht_data.session->start(); + // This thread will handle all hosttrap sessions on this agent + // In the future, there will be another thread to handle stochastic sessions. + ht_data.thread = os::CreateThread(PcSamplingThreadRun, (void*)this); + if (!ht_data.thread) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Failed to start PC Sampling thread."); + } + + if (hsaKmtPcSamplingStart(node_id(), session.ThunkId()) == HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_SUCCESS; + + debug_print("Failed to start PC sampling session with thunkId:%d\n", session.ThunkId()); + if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + ht_data.session->stop(); + os::WaitForThread(ht_data.thread); + os::CloseThread(ht_data.thread); + ht_data.thread = NULL; + } + + return HSA_STATUS_ERROR; +} + +hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) { + if (!session.isActive()) return HSA_STATUS_SUCCESS; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + session.stop(); + + HSAKMT_STATUS retKmt = hsaKmtPcSamplingStop(node_id(), session.ThunkId()); + if (retKmt != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session."); + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + // Wake up 
pcs_hosttrap_thread_ if it is waiting for data + HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1); + HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1); + + os::WaitForThread(ht_data.thread); + os::CloseThread(ht_data.thread); + ht_data.thread = NULL; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( + pcs::PcsRuntime::PcSamplingSession& session) { + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + uint32_t& which_buffer = ht_data.which_buffer; + uint32_t* cmd_data = ht_data.cmd_data; + size_t& cmd_data_sz = ht_data.cmd_data_sz; + uint64_t* old_val = ht_data.old_val; + hsa_signal_t& exec_pm4_signal = ht_data.exec_pm4_signal; + + /* + * Device-buffer to Host-buffer to User-Buffer copy logic + * + * Device-buffer = buffer written by 2nd level trap handler + * Host-buffer = buffer inside ROCr + * User-buffer = Session buffer size specified in PCSamplingSessionCreate + * + * Conditions for the buffer sizes: + * Host buffer is at least 2 times bigger than device buffer and Host buffer + * is also at least 2 times bigger than User-Buffer. + * + * Key: + * Device-Buffer[==--][----] : Device-Buffer#1 has size 4*N, and is half-full + * Device-Buffer#2 has size 4*N and is empty + * + * Host-Buffer[=---------] : Host Buffer has size 10*N and is filled with N. + * + * N will vary based on the User-buffer size, this example is to show the + * relative sizes between each copy. + * + * 1. Initial state + * - User has created a new session with buffer size = 7*N + * + * Device-Buffer[---][---] + * Host-Buffer[--------------] wptr=0 rptr=0 wrap_pos=0 + * User-Buffer[-------] + * + * -- Device Buffer has size 3*N + * -- Host-Buffer has size 14*N (2x User-Buffer) + * -- User-Buffer has size 7*N + * + * 2. 
Device Buffer#1 hits watermark + * State at beginning: + * Device-Buffer[===][---] + * Host-Buffer[--------------] + * User-Buffer[-------] + * + * -- Copy 3*N from Device-Buffer#1 to Host-Buffer + * -- In the meantime, 2nd level trap handler is writing to Device-Buffer#2 + * -- We do not have enough data to fill User-Buffer + * + * State at end: + * Device-Buffer[---][=--] + * Host-Buffer[===-----------] wptr=3 rptr=0, wrap_pos=0 + * User-Buffer[-------] + * + * 3. Device Buffer#2 hits watermark + * State at beginning: + * Device-Buffer[---][===] + * Host-Buffer[===-----------] + * User-Buffer[-------] + * + * -- Copy 3*N from Device-Buffer#2 to Host-Buffer + * -- In the meantime, 2nd level trap handler is writing to Device-Buffer#1 + * -- We do not have enough data to fill User-Buffer + * + * State at end: + * Device-Buffer[=--][---] + * Host-Buffer[======--------] wptr=6 rptr=0 wrap_pos=0 + * User-Buffer[-------] + * + * 4. Device Buffer#1 hits watermark + * State at beginning: + * Device-Buffer[---][===] + * Host-Buffer[======--------] + * User-Buffer[-------] + * + * -- Copy 3*N from Device-Buffer#2 to Host-Buffer + * -- In the meantime, 2nd level trap handler is writing to Device-Buffer#1 + * + * Device-Buffer[=--][---] + * Host-Buffer[=========-----] + * User-Buffer[-------] + * + * -- We have enough data to fill User-Buffer. Callback user data-ready to + * -- copy 7*N to user. + * + * Device-Buffer[=--][---] + * Host-Buffer[-------==-----] + * User-Buffer[=======] + * + * -- User processes User-Buffer + * + * Device-Buffer[=--][---] + * Host-Buffer[-------==-----] wptr=9 rptr=7 wrap_pos=0 + * User-Buffer[-------] + * + * 6. Device Buffer#1 hits watermark + * State at end: + * Device-Buffer[---][=--] + * Host-Buffer[-------=====--] wptr=12 rptr=7 wrap_pos=0 + * User-Buffer[-------] + * + * 7. 
Device Buffer#2 hits watermark + * State at beginning: + * Device-Buffer[---][===] + * Host-Buffer[-------=====--] wptr=12 rptr=7 wrap_pos=0 + * User-Buffer[-------] + * + * -- We do not have enough space after wptr. The CP-DMA copy + * -- can only copy a contiguous range, so copy to the + * -- beginning of Host-Buffer and set wrap_pos + * + * Device-Buffer[=--][---] + * Host-Buffer[===----=====--] wptr=3 rptr=7 wrap_pos=12 + * User-Buffer[-------] + * + * -- We have enough data to fill User-Buffer. Callback user data-ready to + * -- copy 7*N to user. We copy the tail end (index 7-12) of Host-Buffer + * -- before copying the beginning of Host-Buffer (index 0-2). + * + * Device-Buffer[=--][---] + * Host-Buffer[--=-----------] wptr=3 rptr=2 wrap_pos=0 + * User-Buffer[=======] + * + * -- User processes User-Buffer + * + * 8. Device Buffer#1 hits watermark + * State at end: + * Device-Buffer[---][=--] + * Host-Buffer[--====--------] wptr=6 rptr=2 wrap_pos=0 + * User-Buffer[-------] + */ + + uint32_t next_buffer; + + uint64_t reset_write_val; + uint32_t to_copy, copy_bytes; + + const uint32_t atomic_ex_cmd_sz = 9; + const uint32_t wait_reg_mem_cmd_sz = 7; + const uint32_t dma_data_cmd_sz = 7; + const uint32_t copy_data_cmd_sz = 6; + const uint32_t write_data_cmd_sz = 5; + + uint8_t* host_buffer_begin = ht_data.host_buffer; + uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + + uint64_t buf_write_val = (uint64_t) & (ht_data.device_data->buf_write_val); + uint64_t buf_written_val[] = {(uint64_t) & (ht_data.device_data->buf_written_val0), + (uint64_t) & (ht_data.device_data->buf_written_val1)}; + + size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) + + sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1); + + hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1}; + uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset, + (uint8_t*)ht_data.device_data + buf_offset + + 
ht_data.device_data->buf_size * session.sample_size()}; + + next_buffer = (which_buffer + 1) % 2; + reset_write_val = (uint64_t)next_buffer << 63; + + /* + * ATOMIC_MEM, perform atomic_exchange + * We use a double-buffer mechanism so that trap handlers calls are writing to one buffer while + * hsa-runtime is copying data from the other buffer. + * + * 1. Atomically swap buffers on the device. Future trap handler calls will put their data into + * next_buffer. + * 2. Return a 64-bit packed value to ROCr; the upper bit is the old buffer and can be ignored. + * The lower 63 bits are how many trap handler entrances happened before the atomic swap + * i.e., what value to wait for in buf_written_val to know all previous trap entries were + * done. + */ + unsigned int i = 0; + memset(cmd_data, 0, cmd_data_sz); + cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_ATOMIC_MEM, atomic_ex_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_ATOMIC_MEM_DW1_ATOMIC(PM4_ATOMIC_MEM_GL2_OP_ATOMIC_SWAP_RTN_64); + cmd_data[i++] = PM4_ATOMIC_MEM_DW2_ADDR_LO(buf_write_val); + cmd_data[i++] = PM4_ATOMIC_MEM_DW3_ADDR_HI((buf_write_val) >> 32); + cmd_data[i++] = PM4_ATOMIC_MEM_DW4_SRC_DATA_LO((uint64_t)reset_write_val); + cmd_data[i++] = PM4_ATOMIC_MEM_DW5_SRC_DATA_HI(((uint64_t)reset_write_val) >> 32); + i += 3; + /* copy data */ + cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_COPY_DATA, copy_data_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = + PM4_COPY_DATA_DW1(PM4_COPY_DATA_SRC_SEL_ATOMIC_RETURN_DATA | PM4_COPY_DATA_DST_SEL_TC_12 | + PM4_COPY_DATA_COUNT_SEL | PM4_COPY_DATA_WR_CONFIRM); + i += 2; + cmd_data[i++] = PM4_COPY_DATA_DW4_DST_ADDR_LO((uint64_t)old_val); + cmd_data[i++] = PM4_COPY_DATA_DW5_DST_ADDR_HI(((uint64_t)old_val) >> 32); + + HSA::hsa_signal_store_screlease(exec_pm4_signal, 1); + + queues_[QueuePCSampling]->ExecutePM4( + cmd_data, (atomic_ex_cmd_sz + copy_data_cmd_sz) * sizeof(uint32_t), HSA_FENCE_SCOPE_NONE, + HSA_FENCE_SCOPE_SYSTEM, &exec_pm4_signal); + do { + 
hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( + exec_pm4_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + if (val == -1) return HSA_STATUS_SUCCESS; + if (val == 0) break; + } while (true); + + *old_val &= (ULLONG_MAX >> 1); + /* If the number of entries in old_val is larger than buf_size, then there was a buffer overflow + * and the 2nd level trap handler code will skip recording samples, causing lost samples + */ + if (*old_val > (uint64_t)ht_data.device_data->buf_size) { + ht_data.lost_sample_count = *old_val - (uint64_t)ht_data.device_data->buf_size; + *old_val = (uint64_t)ht_data.device_data->buf_size; + } + + to_copy = *old_val * session.sample_size(); + + /* Make sure there is enough space after host_write_ptr */ + if (ht_data.host_write_ptr + to_copy >= host_buffer_end) { + // Need to wrap around + ht_data.host_buffer_wrap_pos = ht_data.host_write_ptr; + ht_data.host_write_ptr = host_buffer_begin; + } + /* + * Do the WAIT_REG_MEM, DMA_DATA(s) and WRITE_DATA + * + * 1. Wait for all trap handlers have finished writing values to this buffer by waiting for + * buf_written_val to equal to old_val. + * 2. Copy the values out of buffer to the host buffers. + * 3. Reset buf_written_val so that we start writing to beginning of this buffer on the next + * buffer swap. 
+ */ + i = 0; + memset(cmd_data, 0, cmd_data_sz); + + /* WAIT_REG_MEM, wait on buf_written_val */ + cmd_data[i++] = + PM4_HDR(PM4_HDR_IT_OPCODE_WAIT_REG_MEM, wait_reg_mem_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW1(PM4_WAIT_REG_MEM_FUNCTION_EQUAL_TO_REFERENCE | + PM4_WAIT_REG_MEM_MEM_SPACE_MEMORY_SPACE | + PM4_WAIT_REG_MEM_OPERATION_WAIT_REG_MEM); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW2_MEM_POLL_ADDR_LO(buf_written_val[which_buffer]); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW3_MEM_POLL_ADDR_HI((buf_written_val[which_buffer]) >> 32); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW4_REFERENCE(*old_val); + cmd_data[i++] = 0xFFFFFFFF; + cmd_data[i++] = PM4_WAIT_REG_MEM_DW6(PM4_WAIT_REG_MEM_POLL_INTERVAL(4) | + PM4_WAIT_REG_MEM_OPTIMIZE_ACE_OFFLOAD_MODE); + + unsigned int num_copy_command = 0; + uint8_t* buffer_temp = buffer[which_buffer]; + for (copy_bytes = CP_DMA_DATA_TRANSFER_CNT_MAX; 0 < to_copy; to_copy -= copy_bytes) { + num_copy_command++; + + /* DMA_DATA PACKETS, copy buffer using CPDMA */ + cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_DMA_DATA, dma_data_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_DMA_DATA_DW1(PM4_DMA_DATA_DST_SEL_DST_ADDR_USING_L2 | + PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2); + cmd_data[i++] = PM4_DMA_DATA_DW2_SRC_ADDR_LO((uint64_t)buffer_temp); + cmd_data[i++] = PM4_DMA_DATA_DW3_SRC_ADDR_HI(((uint64_t)buffer_temp) >> 32); + cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)ht_data.host_write_ptr); + cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)ht_data.host_write_ptr) >> 32); + + if (copy_bytes >= to_copy) { + copy_bytes = to_copy; + cmd_data[i++] = + PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC_LAST); + } else { + cmd_data[i++] = PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC); + } + + buffer_temp += copy_bytes; + ht_data.host_write_ptr += copy_bytes; + } + + /* WRITE_DATA, Reset buf_written_val */ + cmd_data[i++] = 
PM4_HDR(PM4_HDR_IT_OPCODE_WRITE_DATA, write_data_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_WRITE_DATA_DW1(PM4_WRITE_DATA_DST_SEL_TC_L2 | + PM4_WRITE_DATA_WR_CONFIRM_WAIT_CONFIRMATION); + cmd_data[i++] = PM4_WRITE_DATA_DW2_DST_MEM_ADDR_LO(buf_written_val[which_buffer]); + cmd_data[i++] = PM4_WRITE_DATA_DW3_DST_MEM_ADDR_HI((buf_written_val[which_buffer]) >> 32); + cmd_data[i++] = PM4_WRITE_DATA_DW4_DATA(0); + + unsigned int cmd_sz = + wait_reg_mem_cmd_sz + (num_copy_command * dma_data_cmd_sz) + write_data_cmd_sz; + + HSA::hsa_signal_store_screlease(exec_pm4_signal, 1); + queues_[QueuePCSampling]->ExecutePM4(cmd_data, cmd_sz * sizeof(uint32_t), HSA_FENCE_SCOPE_NONE, + HSA_FENCE_SCOPE_SYSTEM, &exec_pm4_signal); + do { + hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( + exec_pm4_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + if (val == -1) return HSA_STATUS_SUCCESS; + if (val == 0) break; + } while (true); + + which_buffer = next_buffer; + + return HSA_STATUS_SUCCESS; +} + +void GpuAgent::PcSamplingThread() { + // TODO: Implement lost sample count + // TODO: Implement latency + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + pcs::PcsRuntime::PcSamplingSession& session = *ht_data.session; + uint32_t& which_buffer = ht_data.which_buffer; + + hsa_status_t ret = HSA_STATUS_SUCCESS; + uint8_t* host_buffer_begin = ht_data.host_buffer; + uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + + size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) + + sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1); + + hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1}; + uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset, + (uint8_t*)ht_data.device_data + buf_offset + + ht_data.device_data->buf_size * session.sample_size()}; + + while (ht_data.session->isActive()) { + do { + hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( + 
done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + if (val == -1) goto thread_exit; + if (val == 0) break; + } while (true); + HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1); + + std::lock_guard lock(ht_data.host_buffer_mutex); + if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS) + goto thread_exit; + + size_t bytes_before_wrap; + size_t bytes_after_wrap; + + assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end); + assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end); + assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr) + : (ht_data.host_read_ptr <= ht_data.host_write_ptr)); + + if (ht_data.host_buffer_wrap_pos) { + assert(ht_data.host_buffer_wrap_pos <= host_buffer_end && + ht_data.host_buffer_wrap_pos > host_buffer_begin); + assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos); + + // Wrapped around + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin; + + while (bytes_before_wrap >= session.buffer_size()) { + session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + + if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) { + session.HandleSampleData(ht_data.host_read_ptr, bytes_before_wrap, host_buffer_begin, + (session.buffer_size() - bytes_before_wrap), 0); + ht_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap); + bytes_before_wrap = 0; + ht_data.host_buffer_wrap_pos = 0; + bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + + while (bytes_after_wrap >= 
session.buffer_size()) { + session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = 0; + bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } else { + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + + while (bytes_before_wrap >= session.buffer_size()) { + assert(ht_data.host_read_ptr >= host_buffer_begin && + ht_data.host_read_ptr + session.buffer_size() < host_buffer_end); + session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } + } +thread_exit: + debug_print("PcSamplingThread::Exiting\n"); +} + +void GpuAgent::PcSamplingThreadRun(void* _agent) { + GpuAgent* agent = (GpuAgent*)_agent; + agent->PcSamplingThread(); + debug_print("PcSamplingThread exiting..."); +} + +hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) { + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + uint8_t* host_buffer_begin = ht_data.host_buffer; + uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + + size_t bytes_before_wrap; + size_t bytes_after_wrap; + + std::lock_guard lock(ht_data.host_buffer_mutex); + if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end); + assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end); + assert(ht_data.host_buffer_wrap_pos ? 
(ht_data.host_read_ptr > ht_data.host_write_ptr) + : (ht_data.host_read_ptr <= ht_data.host_write_ptr)); + + if (ht_data.host_buffer_wrap_pos) { + assert(ht_data.host_buffer_wrap_pos <= host_buffer_end && + ht_data.host_buffer_wrap_pos > host_buffer_begin); + assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos); + + // Wrapped around + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin; + + while (bytes_before_wrap > 0) { + size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size()); + + session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += bytes_to_copy; + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + + assert(ht_data.host_read_ptr == ht_data.host_buffer_wrap_pos); + ht_data.host_buffer_wrap_pos = 0; + ht_data.host_read_ptr = host_buffer_begin; + + while (bytes_after_wrap > 0) { + size_t bytes_to_copy = std::min(bytes_after_wrap, session.buffer_size()); + + session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += bytes_to_copy; + bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } else { + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + + while (bytes_before_wrap) { + size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size()); + assert(ht_data.host_read_ptr >= host_buffer_begin && + ht_data.host_read_ptr + bytes_to_copy <= host_buffer_end); + + session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += bytes_to_copy; + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } + return HSA_STATUS_SUCCESS; +} + } // namespace amd } // 
namespace rocr diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index f664102fd..7d38a83b4 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -50,12 +50,14 @@ #include "core/inc/amd_gpu_agent.h" #include "core/util/utils.h" #include "core/inc/exceptions.h" +#include namespace rocr { namespace AMD { // Tracks aggregate size of system memory available on platform size_t MemoryRegion::max_sysmem_alloc_size_ = 0; +size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE); void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) { void* ret = NULL; @@ -100,9 +102,10 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { } MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, - bool extended_scope_fine_grain, core::Agent* owner, + bool extended_scope_fine_grain, bool user_visible, core::Agent* owner, const HsaMemoryProperties& mem_props) - : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, owner), + : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, user_visible, + owner), mem_props_(mem_props), max_single_alloc_size_(0), virtual_size_(0), @@ -136,7 +139,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, virtual_size_ = kGpuVmSize; } else if (IsSystem()) { - mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; + mem_flag_.ui32.PageSize = MemoryRegion::kPageSize_; mem_flag_.ui32.NoSubstitute = 0; mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; @@ -161,7 +164,6 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, } assert(GetVirtualSize() != 0); - assert(GetPhysicalSize() <= GetVirtualSize()); assert(IsMultipleOf(max_single_alloc_size_, kPageSize_)); } @@ -207,6 +209,12 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, 
kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute); kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess); + if (IsLocalMemory()) { + // Allocate physically contiguous memory - AllocateKfdMemory function call will fail + // if this flag is not supported in KFD. + kmt_alloc_flags.ui32.Contiguous = + (alloc_flags & AllocateContiguous ? 1 : kmt_alloc_flags.ui32.Contiguous); + } // Only allow using the suballocator for ordinary VRAM. if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) { @@ -237,9 +245,9 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size); } - if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS; - if (*address != nullptr) { + if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS; + // Commit the memory. // For system memory, on non-restricted allocation, map it to all GPUs. 
On // restricted allocation, only CPU is allowed to access by default, so diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp index 89c5cd260..e595bffac 100644 --- a/src/core/runtime/amd_topology.cpp +++ b/src/core/runtime/amd_topology.cpp @@ -367,6 +367,17 @@ void BuildTopology() { } const_cast(core::Runtime::runtime_singleton_->flag()).parse_masks(maxGpu, maxCu); + // Temporary work-around, disable SDMA ganging on non-APUs in non-SPX modes + // Check xGMI APU status + bool isXgmiApu = false; + auto& firstCpu = core::Runtime::runtime_singleton_->cpu_agents()[0]; + for (auto& peer_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { + auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(firstCpu->node_id(), + peer_gpu->node_id()); + isXgmiApu = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI; + if (isXgmiApu) break; + } + // Register destination agents that can SDMA gang copy for source agents for (auto& src_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { uint32_t src_id = src_gpu->node_id(); @@ -383,7 +394,7 @@ void BuildTopology() { // Weight of 41 - Inter-socket GPU link in multi-partition mode if (linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) { if (linfo.info.numa_distance == 13 || linfo.info.numa_distance == 41) - gang_factor = 2; + gang_factor = isXgmiApu ? 
2 : 1; else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth) gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth; else gang_factor = 1; diff --git a/src/core/runtime/blit_shaders/CMakeLists.txt b/src/core/runtime/blit_shaders/CMakeLists.txt index dc32b2f2b..e63d380b8 100644 --- a/src/core/runtime/blit_shaders/CMakeLists.txt +++ b/src/core/runtime/blit_shaders/CMakeLists.txt @@ -49,9 +49,10 @@ find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) # Set the target devices -set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100") +set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100;gfx1200") + # Set the postfix for each target device -set (POSTFIX "9;940;1010;10;11") +set (POSTFIX "9;940;1010;10;11;12") # If verbose output is enabled, print paths and target devices if(${CMAKE_VERBOSE_MAKEFILE}) diff --git a/src/core/runtime/blit_shaders/blit_copyAligned.s b/src/core/runtime/blit_shaders/blit_copyAligned.s index 750366ff6..c861147aa 100644 --- a/src/core/runtime/blit_shaders/blit_copyAligned.s +++ b/src/core/runtime/blit_shaders/blit_copyAligned.s @@ -146,8 +146,12 @@ compute_pgm_rsrc1_vgprs = CopyAlignedRsrc1VGPRs s_load_dword s24, s[0:1], 0x50 s_waitcnt lgkmcnt(0) - + .if (.amdgcn.gfx_generation_number == 12) + s_lshl_b32 s2, ttmp9, 0x6 + .else s_lshl_b32 s2, s2, 0x6 + .endif + V_ADD_CO_U32 v0, s2, v0 v_mov_b32 v3, s5 diff --git a/src/core/runtime/blit_shaders/blit_copyMisaligned.s b/src/core/runtime/blit_shaders/blit_copyMisaligned.s index a63b2ace5..48a5b3ec3 100644 --- a/src/core/runtime/blit_shaders/blit_copyMisaligned.s +++ b/src/core/runtime/blit_shaders/blit_copyMisaligned.s @@ -117,7 +117,12 @@ CopyMisaligned: s_load_dword s16, s[0:1], 0x30 s_waitcnt lgkmcnt(0) - s_lshl_b32 s2, s2, 0x6 + .if (.amdgcn.gfx_generation_number == 12) + s_lshl_b32 s2, ttmp9, 0x6 + .else + s_lshl_b32 s2, s2, 0x6 + .endif + V_ADD_CO_U32 v0, s2, 
v0 v_mov_b32 v3, s5 diff --git a/src/core/runtime/blit_shaders/blit_fill.s b/src/core/runtime/blit_shaders/blit_fill.s index bdc4fbcc5..752499b4f 100644 --- a/src/core/runtime/blit_shaders/blit_fill.s +++ b/src/core/runtime/blit_shaders/blit_fill.s @@ -117,7 +117,12 @@ Fill: s_load_dwordx4 s[8:11], s[0:1], 0x10 s_waitcnt lgkmcnt(0) - s_lshl_b32 s2, s2, 0x6 + .if (.amdgcn.gfx_generation_number == 12) + s_lshl_b32 s2, ttmp9, 0x6 + .else + s_lshl_b32 s2, s2, 0x6 + .endif + V_ADD_CO_U32 v0, s2, v0 .macro mFillPattern iter iter_end diff --git a/src/core/runtime/default_signal.cpp b/src/core/runtime/default_signal.cpp index 820fc75ca..b3e5a23f2 100644 --- a/src/core/runtime/default_signal.cpp +++ b/src/core/runtime/default_signal.cpp @@ -57,7 +57,7 @@ int BusyWaitSignal::rtti_id_ = 0; BusyWaitSignal::BusyWaitSignal(SharedSignal* abi_block, bool enableIPC) : Signal(abi_block, enableIPC) { signal_.kind = AMD_SIGNAL_KIND_USER; - signal_.event_mailbox_ptr = NULL; + signal_.event_mailbox_ptr = uint64_t(NULL); } hsa_signal_value_t BusyWaitSignal::LoadRelaxed() { diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index c509fda5e..8ad8ff264 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -343,7 +343,8 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin {"hsa_ven_amd_loader_1_01_pfn_t", sizeof(hsa_ven_amd_loader_1_01_pfn_t)}, {"hsa_ven_amd_loader_1_02_pfn_t", sizeof(hsa_ven_amd_loader_1_02_pfn_t)}, {"hsa_ven_amd_loader_1_03_pfn_t", sizeof(hsa_ven_amd_loader_1_03_pfn_t)}, - {"hsa_ven_amd_aqlprofile_1_00_pfn_t", sizeof(hsa_ven_amd_aqlprofile_1_00_pfn_t)}}; + {"hsa_ven_amd_aqlprofile_1_00_pfn_t", sizeof(hsa_ven_amd_aqlprofile_1_00_pfn_t)}, + {"hsa_ven_amd_pc_sampling_1_00_pfn_t", sizeof(hsa_ven_amd_pc_sampling_1_00_pfn_t)}}; static const size_t num_tables = sizeof(sizes) / sizeof(sizes_t); if (minor > 99) return 0; @@ -372,6 +373,9 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin 
case HSA_EXTENSION_AMD_AQLPROFILE: name = "hsa_ven_amd_aqlprofile_"; break; + case HSA_EXTENSION_AMD_PC_SAMPLING: + name = "hsa_ven_amd_pc_sampling_"; + break; default: return 0; } @@ -429,6 +433,21 @@ hsa_status_t hsa_system_get_major_extension_table(uint16_t extension, uint16_t v return HSA_STATUS_SUCCESS; } + if (extension == HSA_EXTENSION_AMD_PC_SAMPLING) { + if (version_major != core::Runtime::runtime_singleton_->extensions_.pcs_api.version.major_id) { + return HSA_STATUS_ERROR; + } + hsa_ven_amd_pc_sampling_1_00_pfn_t ext_table; + ext_table.hsa_ven_amd_pcs_create = hsa_ven_amd_pcs_create; + ext_table.hsa_ven_amd_pcs_create_from_id = hsa_ven_amd_pcs_create_from_id; + ext_table.hsa_ven_amd_pcs_destroy = hsa_ven_amd_pcs_destroy; + ext_table.hsa_ven_amd_pcs_start = hsa_ven_amd_pcs_start; + ext_table.hsa_ven_amd_pcs_stop = hsa_ven_amd_pcs_stop; + ext_table.hsa_ven_amd_pcs_flush = hsa_ven_amd_pcs_flush; + + memcpy(table, &ext_table, Min(sizeof(ext_table), table_length)); + } + if (extension == HSA_EXTENSION_FINALIZER) { if (version_major != core::Runtime::runtime_singleton_->extensions_.finalizer_api.version.major_id) { @@ -2195,6 +2214,7 @@ hsa_status_t hsa_executable_create_alt( IS_BAD_PTR(executable); Executable *exec = GetLoader()->CreateExecutable( + std::unique_ptr(new amd::LoaderContext()), profile, options, default_float_rounding_mode); CHECK_ALLOC(exec); diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp index 0c3ba59a7..48dee4858 100644 --- a/src/core/runtime/hsa_api_trace.cpp +++ b/src/core/runtime/hsa_api_trace.cpp @@ -80,10 +80,11 @@ void HsaApiTable::Init() { // they can add preprocessor macros on the new functions constexpr size_t expected_core_api_table_size = 1016; - constexpr size_t expected_amd_ext_table_size = 560; + constexpr size_t expected_amd_ext_table_size = 576; constexpr size_t expected_image_ext_table_size = 120; constexpr size_t expected_finalizer_ext_table_size = 64; constexpr size_t 
expected_tools_table_size = 64; + constexpr size_t expected_pc_sampling_ext_table_size = 72; static_assert(sizeof(CoreApiTable) == expected_core_api_table_size, "HSA core API table size changed, bump HSA_CORE_API_TABLE_STEP_VERSION and set " @@ -101,6 +102,9 @@ void HsaApiTable::Init() { static_assert(sizeof(ToolsApiTable) == expected_tools_table_size, "HSA tools table size changed, bump HSA_TOOLS_API_TABLE_STEP_VERSION " "and set expected_tools_table_size to the new size of the struct"); + static_assert(sizeof(PcSamplingExtTable) == expected_pc_sampling_ext_table_size, + "HSA PC Sampling ext table size changed, bump HSA_PC_SAMPLING_API_TABLE_STEP_VERSION " + "and set expected_pc_sampling_ext_table_size to the new size of the struct"); // Initialize Version of Api Table hsa_api.version.major_id = HSA_API_TABLE_MAJOR_VERSION; @@ -120,6 +124,7 @@ void HsaApiTable::Init() { // of Hsa Runtime initialization, including their major ids hsa_api.finalizer_ext_ = NULL; hsa_api.image_ext_ = NULL; + hsa_api.pc_sampling_ext_ = NULL; UpdateTools(); hsa_api.tools_ = &tools_api; @@ -146,6 +151,13 @@ void HsaApiTable::CloneExts(void* ext_table, uint32_t table_id) { hsa_api.image_ext_ = &image_api; return; } + + // Update HSA Extension PC Sampling Api table + if (table_id == HSA_EXT_PC_SAMPLING_API_TABLE_ID) { + pcs_api = *reinterpret_cast(ext_table); + hsa_api.pc_sampling_ext_ = &pcs_api; + return; + } } void HsaApiTable::LinkExts(void* ext_table, uint32_t table_id) { @@ -165,6 +177,13 @@ void HsaApiTable::LinkExts(void* ext_table, uint32_t table_id) { hsa_api.image_ext_ = reinterpret_cast(ext_table); return; } + + // Update HSA Extension PC Sampling Api table + if (table_id == HSA_EXT_PC_SAMPLING_API_TABLE_ID) { + pcs_api = *reinterpret_cast(ext_table); + hsa_api.pc_sampling_ext_ = &pcs_api; + return; + } } // Update Api table for Hsa Core Runtime @@ -432,6 +451,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_portable_export_dmabuf_fn =
AMD::hsa_amd_portable_export_dmabuf; amd_ext_api.hsa_amd_portable_close_dmabuf_fn = AMD::hsa_amd_portable_close_dmabuf; amd_ext_api.hsa_amd_vmem_address_reserve_fn = AMD::hsa_amd_vmem_address_reserve; + amd_ext_api.hsa_amd_vmem_address_reserve_align_fn = AMD::hsa_amd_vmem_address_reserve_align; amd_ext_api.hsa_amd_vmem_address_free_fn = AMD::hsa_amd_vmem_address_free; amd_ext_api.hsa_amd_vmem_handle_create_fn = AMD::hsa_amd_vmem_handle_create; amd_ext_api.hsa_amd_vmem_handle_release_fn = AMD::hsa_amd_vmem_handle_release; @@ -445,6 +465,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_vmem_get_alloc_properties_from_handle_fn = AMD::hsa_amd_vmem_get_alloc_properties_from_handle; amd_ext_api.hsa_amd_agent_set_async_scratch_limit_fn = AMD::hsa_amd_agent_set_async_scratch_limit; + amd_ext_api.hsa_amd_queue_get_info_fn = AMD::hsa_amd_queue_get_info; } void HsaApiTable::UpdateTools() { diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index 7b5689852..ce8d9256c 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -761,7 +761,7 @@ hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, siz TRY; IS_OPEN(); - if (size == 0 || ptr == NULL || (flags > HSA_AMD_MEMORY_POOL_PCIE_FLAG)) { + if (size == 0 || ptr == NULL) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -774,7 +774,11 @@ hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, siz MemoryRegion::AllocateFlags alloc_flag = core::MemoryRegion::AllocateRestrict; - if (flags == HSA_AMD_MEMORY_POOL_PCIE_FLAG) alloc_flag |= core::MemoryRegion::AllocatePCIeRW; + if (flags & HSA_AMD_MEMORY_POOL_PCIE_FLAG) + alloc_flag |= core::MemoryRegion::AllocatePCIeRW; + + if (flags & HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG) + alloc_flag |= core::MemoryRegion::AllocateContiguous; #ifdef SANITIZER_AMDGPU alloc_flag |= core::MemoryRegion::AllocateAsan; @@ -1072,10 +1076,13 @@ hsa_status_t 
hsa_amd_queue_set_priority(hsa_queue_t* queue, core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); + // Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH + // HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally + // in ROCR static std::map ext_kmt_priomap = { {HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM}, {HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL}, - {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_MAXIMUM}, + {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH}, }; auto priority_it = ext_kmt_priomap.find(priority); @@ -1224,10 +1231,21 @@ hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t addre IS_OPEN(); IS_ZERO(size); IS_TRUE(core::Runtime::runtime_singleton_->VirtualMemApiSupported()); - return core::Runtime::runtime_singleton_->VMemoryAddressReserve(va, size, address, flags); + return core::Runtime::runtime_singleton_->VMemoryAddressReserve(va, size, address, 0, flags); + CATCH; +} + +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags) { + TRY; + IS_OPEN(); + IS_ZERO(size); + IS_TRUE(core::Runtime::runtime_singleton_->VirtualMemApiSupported()); + return core::Runtime::runtime_singleton_->VMemoryAddressReserve(va, size, address, alignment, flags); CATCH; } + hsa_status_t hsa_amd_vmem_address_free(void* va, size_t size) { TRY; IS_OPEN(); @@ -1385,5 +1403,17 @@ hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t _agent, s CATCH; } +hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* _queue, + hsa_queue_info_attribute_t attribute, void* value) { + TRY; + IS_OPEN(); + + core::Queue* queue = core::Queue::Convert(_queue); + IS_VALID(queue); + + return queue->GetInfo(attribute, value); + CATCH; +} + } // namespace amd } // namespace rocr diff --git a/src/core/runtime/hsa_ext_interface.cpp b/src/core/runtime/hsa_ext_interface.cpp index 
2931b2b54..d872e485b 100644 --- a/src/core/runtime/hsa_ext_interface.cpp +++ b/src/core/runtime/hsa_ext_interface.cpp @@ -41,6 +41,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "image/inc/hsa_ext_image_impl.h" +#include "pcs/inc/hsa_ven_amd_pc_sampling_impl.h" #include "core/inc/hsa_ext_interface.h" #include "core/inc/runtime.h" @@ -56,6 +57,7 @@ namespace core { ExtensionEntryPoints::ExtensionEntryPoints() { InitFinalizerExtTable(); InitImageExtTable(); + InitPcSamplingExtTable(); InitAmdExtTable(); } @@ -99,6 +101,22 @@ void ExtensionEntryPoints::InitImageExtTable() { image_api.hsa_ext_image_create_with_layout_fn = hsa_ext_null; } +// Initialize PC Sampling function table to be NULLs +void ExtensionEntryPoints::InitPcSamplingExtTable() { + // Initialize Version of Api Table + pcs_api.version.major_id = 0x00; + pcs_api.version.minor_id = 0x00; + pcs_api.version.step_id = 0x00; + + pcs_api.hsa_ven_amd_pcs_iterate_configuration_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_create_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_create_from_id_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_destroy_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_start_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_stop_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_flush_fn = hsa_ext_null; +} + // Initialize Amd Ext table for Api related to Images void ExtensionEntryPoints::InitAmdExtTable() { hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; @@ -131,6 +149,9 @@ void ExtensionEntryPoints::UnloadImage() { void ExtensionEntryPoints::Unload() { // Reset Image apis to hsa_ext_null function UnloadImage(); +#ifdef HSA_PC_SAMPLING_SUPPORT + rocr::pcs::ReleasePcSamplingRsrcs(); +#endif for (auto lib : libs_) { void* ptr = os::GetExportAddress(lib, "Unload"); @@ -148,6 +169,7 @@ void ExtensionEntryPoints::Unload() { libs_.clear(); InitFinalizerExtTable(); + InitPcSamplingExtTable(); InitImageExtTable(); InitAmdExtTable(); 
core::hsa_internal_api_table_.Reset(); @@ -180,6 +202,23 @@ bool ExtensionEntryPoints::LoadImage() { return true; } +void ExtensionEntryPoints::LoadPcSampling() { +#ifdef HSA_PC_SAMPLING_SUPPORT + if (core::Runtime::runtime_singleton_->flag().disable_pc_sampling()) return; + + // Bind to PC Sampling implementation api's + rocr::pcs::LoadPcSampling(&pcs_api); + + // Initialize Version of Api Table + pcs_api.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pcs_api.version.minor_id = sizeof(PcSamplingExtTable); + pcs_api.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + + // Update private copy of Api table with handle for PC Sampling extensions + hsa_internal_api_table_.CloneExts(&pcs_api, core::HsaApiTable::HSA_EXT_PC_SAMPLING_API_TABLE_ID); +#endif +} + bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { os::LibHandle lib = os::LoadLib(library_name); if (lib == NULL) { @@ -429,6 +468,54 @@ hsa_status_t hsa_ext_image_create_with_layout( image); } +hsa_status_t HSA_API hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api + .hsa_ven_amd_pcs_iterate_configuration_fn(agent, configuration_callback, callback_data); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_create( + hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, + size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_create_fn( + agent, method, units, interval, latency, buffer_size, data_ready_callback, + client_callback_data, pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, +
hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api + .hsa_ven_amd_pcs_create_from_id_fn(pcs_id, agent, method, units, interval, latency, + buffer_size, data_ready_callback, client_callback_data, + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_destroy_fn( + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_start_fn( + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_stop_fn( + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_flush_fn( + pc_sampling); +} + //---------------------------------------------------------------------------// // Stubs for internal extension functions //---------------------------------------------------------------------------// diff --git a/src/core/runtime/intercept_queue.cpp b/src/core/runtime/intercept_queue.cpp index 7f82965ee..47598bb95 100644 --- a/src/core/runtime/intercept_queue.cpp +++ b/src/core/runtime/intercept_queue.cpp @@ -41,6 +41,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "core/inc/intercept_queue.h" +#include "core/inc/amd_aql_queue.h" #include "core/util/utils.h" #include "inc/hsa_api_trace.h" @@ -386,5 +387,18 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) { atomic::Store(&amd_queue_.read_dispatch_id, 
next_packet_, std::memory_order_release); } +hsa_status_t InterceptQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) { + switch (attribute) { + case HSA_AMD_QUEUE_INFO_AGENT: + case HSA_AMD_QUEUE_INFO_DOORBELL_ID: { + if (!AMD::AqlQueue::IsType(wrapped.get())) return HSA_STATUS_ERROR_INVALID_QUEUE; + + AMD::AqlQueue* aqlQueue = static_cast(wrapped.get()); + return aqlQueue->GetInfo(attribute, value); + } + } + return HSA_STATUS_ERROR_INVALID_ARGUMENT; +} + } // namespace core } // namespace rocr diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index c87cbc71f..e3d02d3c3 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -349,6 +349,8 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx1103", 11, 0, 3, unsupported, unsupported, 32) ISAREG_ENTRY_GEN("gfx1150", 11, 5, 0, unsupported, unsupported, 32) ISAREG_ENTRY_GEN("gfx1151", 11, 5, 1, unsupported, unsupported, 32) + ISAREG_ENTRY_GEN("gfx1200", 12, 0, 0, unsupported, unsupported, 32) + ISAREG_ENTRY_GEN("gfx1201", 12, 0, 1, unsupported, unsupported, 32) #undef ISAREG_ENTRY_GEN return supported_isas; } diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index ceae7333e..9ccef65cc 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -73,6 +73,7 @@ #include "core/util/os.h" #include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" +#include "core/inc/amd_core_dump.hpp" #ifndef HSA_VERSION_MAJOR #define HSA_VERSION_MAJOR 1 @@ -1077,7 +1078,8 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) { int connection_fd; char buf[IPC_SOCK_SERVER_DMABUF_FD_HANDLE_LENGTH]; - std::map openDmaBufs; + // openDmaBufs pair is + std::map> openDmaBufs; // Wait until the client has connected while (1) { connection_fd = accept(ipc_sock_server_fd_, NULL, NULL); @@ -1095,9 +1097,31 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) { void *baseAddr = NULL; size_t memLen = 0; - ScopedAcquire lock(&ipc_sock_server_lock_); 
bool isClose = !!(IPC_SOCK_SERVER_CONN_CLOSE_BIT & conn_handle); + bool isAlreadyOpen = false; conn_handle &= ~(IPC_SOCK_SERVER_CONN_CLOSE_BIT); + + // send dmabufs that are already opened + for (auto&conns : openDmaBufs) { + if (conn_handle == conns.first) { + if (!isClose) { + SendDmaBufFd(connection_fd, openDmaBufs[conn_handle].first); + openDmaBufs[conn_handle].second++; + } else { + openDmaBufs[conn_handle].second--; + if (!openDmaBufs[conn_handle].second) { + close(openDmaBufs[conn_handle].first); + openDmaBufs.erase(conn_handle); + } + } + isAlreadyOpen = true; + break; + } + } + + if (isAlreadyOpen) continue; + + ScopedAcquire lock(&ipc_sock_server_lock_); for (auto& conns : ipc_sock_server_conns_) { if (conn_handle == conns.first) { baseAddr = conns.second.first; @@ -1105,20 +1129,16 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) { break; } } - if (!isClose) { - // we can ignore a bad export since importer will catch the bad fd - hsaKmtExportDMABufHandle(baseAddr, memLen, &dmabuf_fd, &fragOffset); - SendDmaBufFd(connection_fd, dmabuf_fd); - openDmaBufs[conn_handle] = dmabuf_fd; - } else { - close(openDmaBufs[conn_handle]); - openDmaBufs.erase(conn_handle); - } + + HSAKMT_STATUS err = hsaKmtExportDMABufHandle(baseAddr, memLen, &dmabuf_fd, &fragOffset); + if (err != HSAKMT_STATUS_SUCCESS) continue; + SendDmaBufFd(connection_fd, dmabuf_fd); + openDmaBufs[conn_handle] = std::make_pair(dmabuf_fd, 1); } // Clean up for (auto& conns : openDmaBufs) - close(conns.second); // close all dangling open dmabuf FDs + close(conns.second.first); // close all dangling open dmabuf FDs ipc_sock_server_conns_.clear(); close(ipc_sock_server_fd_); } @@ -1186,6 +1206,17 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han // System sub allocations are not supported for now. if (handle->handle[3] && useFrag) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + // Work around to defer export on import call to minimize FD creation. 
+ // Without this, a deferred export may fail due to the kernel mode driver not + // holding the GEM object reference. + // Export the dmabuf then close the file to get the reference to ensure the + // deferred export will not run into this problem. + int dmabuf_fd; + uint64_t fragOffset; + HSAKMT_STATUS err = hsaKmtExportDMABufHandle(baseAddr, memLen, &dmabuf_fd, &fragOffset); + if (err != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR; + close(dmabuf_fd); + ScopedAcquire lock(&ipc_sock_server_lock_); if (!ipc_sock_server_conns_.size()) { // create new runtime socket server struct sockaddr_un address; @@ -1250,6 +1281,12 @@ static int GetIPCDmaBufFD(uint32_t conn_handle, uint64_t dmabuf_fd_handle, bool assert(socket_fd > -1 && "DMA buffer could not be imported for IPC!"); if (socket_fd == -1) return -1; + // Set 10 second timeout for ReceiveDmaBufFd + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + setsockopt(socket_fd, SOL_SOCKET, SO_RCVTIMEO, (const char*)&tv, sizeof tv); + char buf[IPC_SOCK_SERVER_DMABUF_FD_HANDLE_LENGTH]; memset(&address, 0, sizeof(struct sockaddr_un)); memset(buf, 0, sizeof(buf)); @@ -1678,6 +1715,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { hsa_status_t custom_handler_status = HSA_STATUS_ERROR; auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers(); + Agent* faulty_agent = nullptr; // If custom handler is registered, pack the fault info and call the handler if (!system_event_handlers.empty()) { hsa_amd_event_t memory_fault_event; @@ -1687,7 +1725,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { // Find the faulty agent auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId); assert(it != runtime_singleton_->agents_by_node_.end() && "Can't find faulty agent."); - Agent* faulty_agent = it->second.front(); + faulty_agent = it->second.front(); fault_info.agent = Agent::Convert(faulty_agent); fault_info.virtual_address = fault.VirtualAddress; @@ -1749,12 +1787,12 
@@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { reason += "Unknown"; } - core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; + faulty_agent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; fprintf( stderr, "Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. Reason: %s.\n", - fault.NodeId, reinterpret_cast(faultingAgent->public_handle().handle), + fault.NodeId, reinterpret_cast(faulty_agent->public_handle().handle), reinterpret_cast(fault.VirtualAddress), (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str()); @@ -1762,6 +1800,16 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { PrintMemoryMapNear(reinterpret_cast(fault.VirtualAddress)); #endif } + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + generated by hsa-runtime. + if (faulty_agent && faulty_agent->isa()->GetMajorVersion() != 11 && + !runtime_singleton_->KfdVersion().supports_core_dump) { + + if (pcs::PcsRuntime::instance()->SessionsActive()) + fprintf(stderr, "GPU core dump skipped because PC Sampling active\n"); + else if (amd::coredump::dump_gpu_core()) + fprintf(stderr, "GPU core dump failed\n"); + } assert(false && "GPU memory access fault."); std::abort(); } @@ -1953,6 +2001,11 @@ void Runtime::LoadExtensions() { extensions_.LoadImage(); hsa_api_table_.LinkExts(&extensions_.image_api, core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID); + + // Update Hsa Api Table with handle of PCS extension Apis + extensions_.LoadPcSampling(); + hsa_api_table_.LinkExts(&extensions_.pcs_api, + core::HsaApiTable::HSA_EXT_PC_SAMPLING_API_TABLE_ID); } void Runtime::UnloadExtensions() { extensions_.Unload(); } @@ -2909,18 +2962,23 @@ hsa_status_t Runtime::DmaBufClose(int dmabuf) { } hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t address, - uint64_t flags) { + uint64_t alignment, uint64_t flags) { void* addr = (void*)address;
HsaMemFlags memFlags = {}; + + if (!alignment) + alignment = sysconf(_SC_PAGE_SIZE); + ScopedAcquire lock(&memory_lock_); memFlags.ui32.OnlyAddress = 1; memFlags.ui32.FixedAddress = 1; + /* Try reserving the VA requested by user */ - if (hsaKmtAllocMemory(0, size, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) { + if (hsaKmtAllocMemoryAlign(0, size, alignment, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) { memFlags.ui32.FixedAddress = 0; /* Could not reserved VA requested, allocate alternate VA */ - if (hsaKmtAllocMemory(0, size, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) + if (hsaKmtAllocMemoryAlign(0, size, alignment, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } diff --git a/src/core/runtime/signal.cpp b/src/core/runtime/signal.cpp index eee62f595..065f8ccc9 100644 --- a/src/core/runtime/signal.cpp +++ b/src/core/runtime/signal.cpp @@ -52,7 +52,7 @@ namespace rocr { namespace core { -HybridMutex Signal::ipcLock_; +KernelMutex Signal::ipcLock_; std::map Signal::ipcMap_; void SharedSignalPool_t::clear() { @@ -128,7 +128,7 @@ LocalSignal::LocalSignal(hsa_signal_value_t initial_value, bool exportable) } void Signal::registerIpc() { - ScopedAcquire lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); auto handle = Convert(this); assert(ipcMap_.find(handle.handle) == ipcMap_.end() && "Can't register the same IPC signal twice."); @@ -136,7 +136,7 @@ void Signal::registerIpc() { } bool Signal::deregisterIpc() { - ScopedAcquire lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); if (refcount_ != 0) return false; auto handle = Convert(this); const auto& it = ipcMap_.find(handle.handle); @@ -146,14 +146,14 @@ bool Signal::deregisterIpc() { } Signal* Signal::lookupIpc(hsa_signal_t signal) { - ScopedAcquire lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); const auto& it = ipcMap_.find(signal.handle); if (it == ipcMap_.end()) return nullptr; return it->second; } Signal* Signal::duplicateIpc(hsa_signal_t signal) { - ScopedAcquire
lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); const auto& it = ipcMap_.find(signal.handle); if (it == ipcMap_.end()) return nullptr; it->second->refcount_++; diff --git a/src/core/runtime/trap_handler/CMakeLists.txt b/src/core/runtime/trap_handler/CMakeLists.txt index 2196cb0e9..251572412 100644 --- a/src/core/runtime/trap_handler/CMakeLists.txt +++ b/src/core/runtime/trap_handler/CMakeLists.txt @@ -46,8 +46,9 @@ cmake_minimum_required ( VERSION 3.7 ) find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) -set (TARGET_DEVS "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1030;gfx1100") -set (POSTFIX "9;940;941;942;1010;10;11") +set (TARGET_DEVS "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1030;gfx1100;gfx1200") +set (POSTFIX "9;940;941;942;1010;10;11;12") +set (SOURCE_SUFFIX ";;;;;;;_gfx12") if(${CMAKE_VERBOSE_MAKEFILE}) get_property(clang_path TARGET clang PROPERTY LOCATION) @@ -92,11 +93,11 @@ endfunction(gen_kernel_bc) ##========================================== ## Find device code object name and forward to custom command ##========================================== -function(build_kernel TRAP_HANDLER_NAME TARGET_ID POSTFIX) +function(build_kernel TRAP_HANDLER_NAME TARGET_ID POSTFIX SOURCE_SUFFIX) ## generate trap handler object code files set (CODE_OBJECT_FILE "${TRAP_HANDLER_NAME}_${POSTFIX}") - set (TRAP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/trap_handler.s") + set (TRAP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/trap_handler${SOURCE_SUFFIX}.s") gen_kernel_bc(${TARGET_ID} ${TRAP_FILE} ${CODE_OBJECT_FILE}) ## Build a list of code object file names @@ -117,10 +118,11 @@ function(build_kernel_for_devices TRAP_HANDLER_NAME) foreach(ind RANGE ${dev_count}) list(GET TARGET_DEVS ${ind} dev) list(GET POSTFIX ${ind} post) + list(GET SOURCE_SUFFIX ${ind} suffix) if(${CMAKE_VERBOSE_MAKEFILE}) message("\n Generating: ${dev} ...") endif() - build_kernel(${TRAP_HANDLER_NAME} ${dev} 
${post}) + build_kernel(${TRAP_HANDLER_NAME} ${dev} ${post} "${suffix}") endforeach(ind) set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE) diff --git a/src/core/runtime/trap_handler/trap_handler.s b/src/core/runtime/trap_handler/trap_handler.s index 0936786c5..3933c926e 100644 --- a/src/core/runtime/trap_handler/trap_handler.s +++ b/src/core/runtime/trap_handler/trap_handler.s @@ -47,12 +47,14 @@ .set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 .set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) .set SQ_WAVE_STATUS_HALT_SHIFT , 13 +.set SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT , 18 .set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16)) .set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT , 8 .set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11 .set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28 .set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F .set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12 +.set SQ_WAVE_MODE_EXCP_EN_SIZE , 8 .set TRAP_ID_ABORT , 2 .set TRAP_ID_DEBUGTRAP , 3 .set DOORBELL_ID_SIZE , 10 @@ -64,6 +66,7 @@ .set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) .set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) +.set TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT , 31 .set TTMP6_WAVE_STOPPED_SHIFT , 30 .set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 .set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) @@ -90,21 +93,34 @@ .if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 .set TTMP11_TTMPS_SETUP_SHIFT , 31 + + // Bit to indicate that this is a hosttrap trap instead of stochastic trap + // Currently not used + .set TTMP13_PCS_IS_STOCHASTIC , 24 .endif // ABI between first and second level trap handler: -// ttmp0 = PC[31:0] +// ttmp0 = PC[31:0] +// ttmp8 = WorkgroupIdX +// ttmp9 = WorkgroupIdY +// ttmp10 = WorkgroupIdZ // ttmp12 = SQ_WAVE_STATUS // ttmp14 = TMA[31:0] // ttmp15 = TMA[63:32] // gfx9: // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] -// gfx906/gfx908/gfx90a: -// 
ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// all gfx9 (except gfx940, gfx941, gfx942): +// ttmp6 = 0[6:0], DispatchPktIndx[24:0] +// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveInWg[5:0] +// Note: Once stochastic sampling is implemented, L2 Trap Handler will use Bit 23 +// (TTMP11_PCS_IS_STOCHASTIC) to differentiate between stochastic and hosttrap // gfx940/gfx941/gfx942: +// ttmp11 = 0[0], DispatchPktIndx[24:0], WaveIdInWg[5:0] // ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0] // gfx10: // ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] +// gfx10/gfx11: +// ttmp6 = 0[6:0], DispatchPktIndx[24:0] // gfx1010: // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] // gfx1030/gfx1100: @@ -115,6 +131,31 @@ trap_entry: s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE s_cbranch_scc0 .no_skip_debugtrap +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) // PC_SAMPLING_GFX9 + // ttmp[14:15] is TMA2; Available: ttmp[2:3], ttmp[4:5], ttmp7, ttmp13 + // Check if this is a host-trap. 
For now, if so, that means we are sampling + // + // TMA2 layout: + // [0x00] out_buf_t* host_trap_buffers; + // [0x08] out_buf_t* stochastic_trap_buffers; + // + // --- Start profile trap handlers GFX9 --- // + // if (host_trap) { + // if (stochastic) // Not implemented yet + // ttmp11.bit23 = 1; // Not implemented yet + // profiling_trap_handler(tma->host_trap_buffers); + // } + + s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT + s_cbranch_scc0 .not_host_trap_gfx9 + s_load_dwordx2 ttmp[14:15], ttmp[14:15], 0 glc // ttmp[14:15]=&host_trap_buffers + // TODO: When implementing stochastic sampling, need to set TTMP11_PCS_IS_STOCHASTIC + // or TTMP13_PCS_IS_STOCHASTIC to differentiate between hosttrap and stochastic sampling + s_waitcnt lgkmcnt(0) + s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers + +.not_host_trap_gfx9: +.endif // PC_SAMPLING_GFX9 // If caused by s_trap then advance PC. s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT s_cbranch_scc1 .not_s_trap @@ -134,7 +175,253 @@ trap_entry: // Ignore llvm.debugtrap. s_branch .exit_trap +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) // PC_SAMPLING_GFX9 + // tma->host_trap_buffers Offsets: + // [0x00] uint64_t buf_write_val; + // [0x08] uint32_t buf_size; + // [0x0c] uint32_t reserved0; + // [0x10] uint32_t buf_written_val0; + // [0x14] uint32_t buf_watermark0; + // [0x18] hsa_signal_t done_sig0; + // [0x20] uint32_t buf_written_val1; + // [0x24] uint32_t buf_watermark1; + // [0x28] hsa_signal_t done_sig1; + // [0x30] uint8_t reserved1[16]; + // [0x40] sample_t buffer0[buf_size]; + // [0x40+(buf_size*sizeof(sample_t))]sample_t buffer1[buf_size]; + // + //__global__ void profiling_trap_handler(out_buf_t* tma) { + // uint64_t local_entry = atomicAdd(&tma->buf_write_val, 1); + // int buf_to_use = local_entry >> 63; + // local_entry &= (ULLONG_MAX >> 1); + // + // if (local_entry < tma->buf_size) { + // sample_t *buf_base = buf_to_use ? 
tma->buffer1 : tma->buffer0; + // fill_sample(&buf_base[local_entry]); // reads TTMP11 as well + // + // uint32_t * written = buf_to_use ? &(tma->buf_written_val1) : + // &(tma->buf_written_val0); + // + // uint64_t done = __atomic_fetch_add(written, 1, + // memory_order_release, memory_scope_system); + // + // uint32_t watermark = buf_to_use ? tma->buf_watermark1 : + // tma->buf_watermark0; + // if (done == watermark) { + // hsa_signal_t done_sig = buf_to_use ? tma->done_sig1 : + // tma->done_sig0; + // send_signal(done_sig); + // } + // } + //} + + // ttmp[14:15] is tma->host_trap_buffers; Available: ttmp[2:3], ttmp[4:5], ttmp7, ttmp13 +.profile_trap_handlers_gfx9: + s_mov_b64 ttmp[2:3], 1 // atomic increment buf_write_val + s_atomic_add_x2 ttmp[2:3], ttmp[14:15], glc // ttmp[2:3] = packed local_entry + s_load_dword ttmp13, ttmp[14:15], 0x8 // ttmp13 = tma->buf_size + s_waitcnt lgkmcnt(0) + s_lshr_b32 ttmp7, ttmp3, 31 // ttmp7 = buf_to_use + s_bitset0_b32 ttmp6, 31 // clear out ttmp6 bit31 + s_cmp_eq_u32 ttmp7, 0 // store off buf_to_use ... + s_cbranch_scc1 .skip_ttmp6_set_gfx9 // into bit31 of ttmp6 + s_bitset1_b32 ttmp6, 31 +.skip_ttmp6_set_gfx9: + s_bfe_u64 ttmp[2:3], ttmp[2:3], (63<<16) // ttmp[2:3] = new local_entry + s_cmp_lg_u32 ttmp3, 0 // if entry >= 2^32, always lost + s_cbranch_scc1 .pc_sampling_exit + s_cmp_ge_u32 ttmp2, ttmp13 // if local_entry >= buf_size + s_cbranch_scc1 .pc_sampling_exit + + // ttmp2=local_entry, ttmp7=buf_to_use (also in bit31 of ttmp6), ttmp13=buf_size + // ttmp[14:15] is tma->host_trap_buffers. Available: ttmp3, ttmp[4:5] + s_mul_i32 ttmp13, ttmp13, ttmp7 // ttmp[4:5]=buf_size if ... + s_mul_i32 ttmp4, ttmp13, 0x40 // buf_to_use=1, 0 otherwise + s_mul_hi_u32 ttmp5, ttmp13, 0x40 + + s_add_u32 ttmp4, ttmp4, 0x40 // now ttmp[4:5]=offset from ... + s_addc_u32 ttmp5, ttmp5, 0 // tma to start of target buffer; + s_add_u32 ttmp4, ttmp14, ttmp4 // ttmp[4:5] now points to ...
+ s_addc_u32 ttmp5, ttmp15, ttmp5 // buffer0 or buffer1 + s_mov_b32 ttmp7, ttmp2 + + // ttmp7 contains local_entry, ttmp[4:5] contains "&bufferX", + // ttmp[14:15] holds 'tma->host_trap_buffers' pointer + // ttmp[2:3] and ttmp13 are available for gathering perf sample info + // ttmp[14:15] is live out + + // fill_sample(...) - begin // + // typedef struct { + // [0x00] uint64_t pc; + // [0x08] uint64_t exec_mask; + // [0x10] uint32_t workgroup_id_x; + // [0x14] uint32_t workgroup_id_y; + // [0x18] uint32_t workgroup_id_z; + // [0x1c] uint32_t wave_in_wg : 6; + // uint32_t chiplet : 3; // Currently not used + // uint32_t reserved : 23; + // [0x20] uint32_t hw_id; + // [0x24] uint32_t reserved0; + // [0x28] uint64_t reserved1; + // [0x30] uint64_t timestamp; + // [0x38] uint64_t correlation_id; + // } perf_sample_hosttrap_v1_t; + // + // __device__ void fill_sample_hosttrap_v1(perf_sample_hosttrap_v1_t* buf) { + // buf->pc = ((ttmp1 & 0xffff) << 32) | ttmp0; + // buf->exec_mask = EXEC; + // buf->workgroup_id_x = ttmp8; + // buf->workgroup_id_y = ttmp9; + // buf->workgroup_id_z = ttmp10; + // buf->chiplet_and_wave_id = ttmp11 & 0x3f; + // buf->hw_id = s_getreg_b32(HW_REG_HW_ID); + // buf->timestamp = s_memrealtime; + // buf->correlation_id = get_correlation_id(); + // } + + s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects + s_mul_hi_u32 ttmp3, ttmp7, 0x40 // ttmp[2:3] will contain byte ... 
+ s_add_u32 ttmp2, ttmp2, ttmp4 + s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry] + s_memrealtime ttmp[4:5] + s_and_b32 ttmp1, ttmp1, 0xffff // clear out extra data from PC_HI + s_store_dwordx2 ttmp[0:1], ttmp[2:3] // store PC + s_waitcnt lgkmcnt(0) // wait for timestamp + s_mov_b32 ttmp13, exec_lo + s_store_dword ttmp13, ttmp[2:3], 0x8 // store EXEC_LO + s_mov_b32 ttmp13, exec_hi + s_store_dword ttmp13, ttmp[2:3], 0xc // store EXEC_HI + s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y + s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x30 // store timestamp + s_and_b32 ttmp4, ttmp11, 0x3f + s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg + + // Get HW_ID using S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and HW_ID = 4 (0x4) + s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) + s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID + + // ttmp[2:3] = &buffer[local_entry]; ttmp[4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = tma->host_trap_buffers and is live out; ttmp6.b31 is buf_to_use, 0 or 1 + + // get_correlation_id() -- begin // + // Returns a value to use as a correlation ID. + // Returns a 64bit number made up of the 9-bit queue ID and the + // 25-bit dispatch_pkt concatenated together as: + // Upper 32 bits: {23 0s}{9b queue_id} + // Lower 32 bits: { 7 0s}{25b dispatch_pkt} + // __device__ uint64_t get_correlation_id() { + // uint64_t output; + // // Get bottom 10 bits of queue's doorbell, in doorbell region. + // // Doorbell is 8B (3b per); region is 8K (13b total) so 10 bits. 
+ // output = s_sendmsg(MSG_GET_DOORBELL); + // output &= 0x3ff; + // output <<= 32; + // // TTMP6 contains this packet dispatch ID modulus the queue size + // output |= TTMP6; + // return output; + // } + + // ttmp[2:3] = &buffer[local_entry] + // ttmp[4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = tma->host_trap_buffers and is live out + // ttmp6.b31 is buf_to_use, 0 or 1 and is live out + s_mov_b64 ttmp[4:5], exec // back up EXEC mask + s_mov_b32 exec_lo, 0x80000000 // prepare EXEC for doorbell spin + s_sendmsg sendmsg(MSG_GET_DOORBELL) // message 10, puts doorbell in EXEC +.wait_for_doorbell: + s_nop 0x7 // wait a bit for message to return + s_bitcmp0_b32 exec_lo, 0x1f // returned message will 0 bit 31 + s_cbranch_scc0 .wait_for_doorbell // wait some more if no data yet + s_mov_b32 exec_hi, ttmp5 // do not care about message[63:32] + s_and_b32 ttmp5, exec_lo, DOORBELL_ID_MASK // doorbell now in ttmp5 + s_mov_b32 exec_lo, ttmp4 // exec mask restored + s_and_b32 ttmp4, ttmp6, 0x1ffffff // extract low 25 bits from ttmp6 (DispatchPktIndx[24:0]) + // ttmp[4:5] is correlation ID + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x38 // store correlation_id to sample + // get_correlation_id() -- end // + + // complete stores before returning + s_dcache_wb + s_waitcnt lgkmcnt(0) + // fill_sample(...) - end // + + // ttmp[2:3], ttmp[4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = tma->host_trap_buffers; ttmp6.b31 is buf_to_use, 0 or 1 + s_lshr_b32 ttmp13, ttmp6, 31 // ttmp13 is buf_to_use + s_mulk_i32 ttmp13, 0x10 + // written_val0 to written_val_X + s_add_u32 ttmp14, ttmp14, ttmp13 // now ttmp[14:15] points to ... 
+ s_addc_u32 ttmp15, ttmp15, 0x0 // buf_written_valX-0x10 + s_mov_b32 ttmp7, 1 // atomic increment buf_written_valX + s_atomic_add ttmp7, ttmp[14:15], 0x10 glc // ttmp7 will contain 'done' + s_load_dword ttmp13, ttmp[14:15], 0x14 // ttmp13 will hold watermark + s_waitcnt lgkmcnt(0) + s_cmp_lg_u32 ttmp7, ttmp13 // if 'done' not at watermark, exit + s_cbranch_scc1 .pc_sampling_exit + + // ttmp[2:3], [4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = buf_written_valX-0x10 + + // send_signal(...) - begin // + //__device__ void send_signal(hsa_signal_t* signal) { + // + // amd_signal_t *sig = (amd_signal_t *)signal->handle; + // __atomic_store(&(sig->value), 0, memory_order_relaxed, memory_scope_system); + // if (sig->event_mailbox_ptr != NULL && sig->event_id != NULL) { + // uint32_t id = sig->event_id; + // __atomic_store(sig->event_mailbox_ptr, id, + // memory_order_relaxed, memory_scope_system); + // __builtin_amdgcn_s_sendmsg(1, id); + // } + //} + // We jump to the trap handler exit after this, so no live-out registers except + // those that must survive the trap handler + + s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x18 // load done_sig into ttmp[2:3] + s_waitcnt lgkmcnt(0) // it's actually an amd_signal_t* + s_load_dwordx2 ttmp[4:5], ttmp[2:3], 0x10 // load event mailbox ptr into 4:5 + s_load_dword ttmp7, ttmp[2:3], 0x18 // load event_id into ttmp7 + s_mov_b64 ttmp[14:15], 0 + s_store_dwordx2 ttmp[14:15], ttmp[2:3], 0x8 glc // zero out signal value + s_waitcnt lgkmcnt(0) // wait for value store to complete + s_cmp_eq_u64 ttmp[4:5], 0 + s_cbranch_scc1 .pc_sampling_exit // null mailbox means no interrupt + s_cmp_eq_u32 ttmp7, 0 + s_cbranch_scc1 .pc_sampling_exit // event_id zero means no interrupt + s_store_dword ttmp7, ttmp[4:5] glc // send event ID to the mailbox + s_waitcnt lgkmcnt(0) + s_mov_b32 ttmp13, m0 // save off m0 + s_mov_b32 m0, ttmp7 // put ID into message payload + s_nop 0x0 // Manually inserted wait states + s_sendmsg sendmsg(MSG_INTERRUPT) // send 
interrupt message + s_waitcnt lgkmcnt(0) // wait for message to be sent + s_mov_b32 m0, ttmp13 // restore m0 + // send_signal(...) - end // +.pc_sampling_exit: + // We can receive regular exceptions while doing PC-Sampling so we need to make sure we + // handle these exceptions here + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN + // Set bits corresponding to TRAPSTS.MEM_VIOL, TRAPSTS.ILLEGAL_INST and TRAPSTS.XNACK_ERROR + s_or_b32 ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT) + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, ttmp2, ttmp3 + // SCC will be 1 if either a maskable instruction was set, or one of MEM_VIOL, ILL_INST, XNACK_ERROR + s_cbranch_scc1 .no_skip_debugtrap // if any of those are set, handle exceptions + + // Check for maskable exceptions + s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) + s_and_b32 ttmp3, ttmp2, ttmp3 + s_cbranch_scc1 .no_skip_debugtrap + + // Since we are in PC sampling, it is safe to ignore watch1/2/3 and single step + // as those should only be enabled by the debugger. + // We could add them for completeness, i.e. check MODE.DEBUG_EN (bit 11) + // and "MODE.EXCP_EN.WATCH (bit 19) && (TRAPSTS.EXCP_HI.ADDR_WATCH1 (bit 12) || TRAPSTS.EXCP_HI.ADDR_WATCH2 (bit 13) || TRAPSTS.EXCP_HI.ADDR_WATCH3 (bit 14)). + s_branch .exit_trap +.endif // PC_SAMPLING_GFX9 .no_skip_debugtrap: // Save trap id and halt status in ttmp6. s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) @@ -243,12 +530,16 @@ trap_entry: // Halt the wavefront upon restoring STATUS below. 
s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT + // Set WAVE.SKIP_EXPORT as a marker so the debugger knows the trap handler was + // entered and has decided to halt the wave. + s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT s_cbranch_scc1 .ttmps_initialized s_mov_b32 ttmp4, 0 s_mov_b32 ttmp5, 0 + s_bitset0_b32 ttmp6, TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT .ttmps_initialized: .endif @@ -273,8 +564,8 @@ trap_entry: .endif // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_and_b64 exec, exec, exec // restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // restore STATUS.VCCZ, not writable by s_setreg_b32 s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 // Return to original (possibly modified) PC. diff --git a/src/core/runtime/trap_handler/trap_handler_gfx12.s b/src/core/runtime/trap_handler/trap_handler_gfx12.s new file mode 100644 index 000000000..2289c57d7 --- /dev/null +++ b/src/core/runtime/trap_handler/trap_handler_gfx12.s @@ -0,0 +1,226 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +/// Trap Handler V2 source +.set DOORBELL_ID_SIZE , 10 +.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) +.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) +.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) +.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) +.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) +.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) +.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) +.set SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT , 4 +.set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT , 7 +.set SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT , 6 +.set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT , 8 +.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT , 0 +.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE , 6 +.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_SHIFT , 0 +.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_SIZE , 6 +.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF +.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) +.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 28 +.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 4 +.set SQ_WAVE_STATE_PRIV_HALT_BFE , (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16)) +.set SQ_WAVE_STATE_PRIV_HALT_SHIFT , 14 +.set SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT , 2 +.set TRAP_ID_ABORT , 2 +.set TRAP_ID_DEBUGTRAP , 3 +.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) +.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 +.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) +.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) +.set TTMP6_SAVED_TRAP_ID_SHIFT , 25 +.set TTMP6_SAVED_TRAP_ID_SIZE , 4 +.set TTMP6_WAVE_STOPPED_SHIFT , 30 +.set TTMP8_DEBUG_FLAG_SHIFT , 31 +.set TTMP11_DEBUG_ENABLED_SHIFT , 23 +.set TTMP_PC_HI_SHIFT , 7 + +// ABI between first and second level trap 
handler: +// { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0] +// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0] +// ttmp12 = SQ_WAVE_STATE_PRIV +// ttmp14 = TMA[31:0] +// ttmp15 = TMA[63:32] + +trap_entry: + // Branch if not a trap (an exception instead). + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE + s_cbranch_scc0 .no_skip_debugtrap + + // If caused by s_trap then advance PC. + s_add_u32 ttmp0, ttmp0, 0x4 + s_addc_u32 ttmp1, ttmp1, 0x0 + +.not_s_trap: + // If llvm.debugtrap and debugger is not attached. + s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP + s_cbranch_scc0 .no_skip_debugtrap + + s_bitcmp0_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT + s_cbranch_scc0 .no_skip_debugtrap + + // Ignore llvm.debugtrap. + s_branch .exit_trap + +.no_skip_debugtrap: + // Save trap id and halt status in ttmp6. + s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) + s_min_u32 ttmp2, ttmp2, 0xF + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 + s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 + + // Fetch doorbell id for our queue. + s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) + s_wait_kmcnt 0 + s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK + + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) + + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT + s_cbranch_scc0 .not_memory_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 + + // Aperture violation requires XNACK_ERROR == 0. 
+ s_branch .not_aperture_violation + +.not_memory_violation: + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT + s_cbranch_scc0 .not_aperture_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 + +.not_aperture_violation: + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT + s_cbranch_scc0 .not_illegal_instruction + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 + +.not_illegal_instruction: + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE) + s_cbranch_scc0 .not_math_exception + s_getreg_b32 ttmp10, hwreg(HW_REG_TRAP_CTRL, SQ_WAVE_TRAP_CTRL_MATH_EXCP_SHIFT, SQ_WAVE_TRAP_CTRL_MATH_EXCP_SIZE) + s_and_b32 ttmp2, ttmp2, ttmp10 + + s_cbranch_scc0 .not_math_exception + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 + +.not_math_exception: + s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE + s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT + s_cbranch_scc0 .not_abort_trap + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 + +.not_abort_trap: + // If no other exception was flagged then report a generic error. + s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK + s_cbranch_scc1 .send_interrupt + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + +.send_interrupt: + // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id + s_mov_b32 ttmp2, m0 + s_mov_b32 m0, ttmp3 + s_nop 0x0 // Manually inserted wait states + s_sendmsg sendmsg(MSG_INTERRUPT) + // Wait for the message to go out. + s_wait_kmcnt 0 + s_mov_b32 m0, ttmp2 + + // Parking the wave requires saving the original pc in the preserved ttmps. 
+ // Register layout before parking the wave: + // + // ttmp10: ?[31:0] + // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] + // + // After parking the wave: + // + // ttmp10: pc_lo[31:0] + // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] + // + // Save the PC + s_mov_b32 ttmp10, ttmp0 + s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK + s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT + s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) + s_or_b32 ttmp11, ttmp11, ttmp1 + + // Park the wave + s_getpc_b64 [ttmp0, ttmp1] + s_add_u32 ttmp0, ttmp0, .parked - . + s_addc_u32 ttmp1, ttmp1, 0x0 + +.halt_wave: + // Halt the wavefront upon restoring STATUS below. + s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT + s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT + + // Initialize TTMP registers + s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT + s_cbranch_scc1 .ttmps_initialized + s_mov_b32 ttmp4, 0 + s_mov_b32 ttmp5, 0 + s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT +.ttmps_initialized: + +.exit_trap: + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12 + s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1) + s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12 + + // Return to original (possibly modified) PC. + s_rfe_b64 [ttmp0, ttmp1] + +.parked: + s_trap 0x2 + s_branch .parked + +// Add s_code_end padding so instruction prefetch always has something to read. +.rept (256 - ((. 
- trap_entry) % 64)) / 4 + s_code_end +.endr diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 43d28bf34..f720b4c4d 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -67,6 +67,7 @@ class Flag { // Lift limit for 2.10 release RCCL workaround. This limit is not used when asynchronous scratch // reclaim is supported const size_t DEFAULT_SCRATCH_SINGLE_LIMIT = 146800640; // small_limit >> 2; + const size_t DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE = 256 * 1024 * 1024; explicit Flag() { Refresh(); } @@ -184,6 +185,9 @@ class Flag { var = os::GetEnvVar("HSA_DISABLE_IMAGE"); disable_image_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_DISABLE_PC_SAMPLING"); + disable_pc_sampling_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_LOADER_ENABLE_MMAP_URI"); loader_enable_mmap_uri_ = (var == "1") ? true : false; @@ -228,6 +232,14 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_IPC_MODE_LEGACY"); enable_ipc_mode_legacy_ = (var == "1") ? true : true; // Temporarily always enable + if (os::IsEnvVarSet("HSA_PCS_MAX_DEVICE_BUFFER_SIZE")) { + var = os::GetEnvVar("HSA_PCS_MAX_DEVICE_BUFFER_SIZE"); + char* end; + pc_sampling_max_device_buffer_size_ = strtoul(var.c_str(), &end, 10); + } else { + pc_sampling_max_device_buffer_size_ = DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE; + } + // Temporary environment variable to disable CPU affinity override // Will either rename to HSA_OVERRIDE_CPU_AFFINITY later or remove completely. 
var = os::GetEnvVar("HSA_OVERRIDE_CPU_AFFINITY_DEBUG"); @@ -297,6 +309,8 @@ class Flag { bool disable_image() const { return disable_image_; } + bool disable_pc_sampling() const { return disable_pc_sampling_; } + bool loader_enable_mmap_uri() const { return loader_enable_mmap_uri_; } size_t force_sdma_size() const { return force_sdma_size_; } @@ -336,6 +350,8 @@ class Flag { bool enable_ipc_mode_legacy() const { return enable_ipc_mode_legacy_; } + size_t pc_sampling_max_device_buffer_size() const { return pc_sampling_max_device_buffer_size_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -353,6 +369,7 @@ class Flag { bool no_scratch_reclaim_; bool no_scratch_thread_limit_; bool disable_image_; + bool disable_pc_sampling_; bool loader_enable_mmap_uri_; bool check_sramecc_validity_; bool debug_; @@ -390,6 +407,8 @@ class Flag { SRAMECC_ENABLE sramecc_enable_; + size_t pc_sampling_max_device_buffer_size_; + // Map GPU index post RVD to its default cu mask. std::map> cu_mask_; diff --git a/src/core/util/lazy_ptr.h b/src/core/util/lazy_ptr.h index e2a847b5c..2aef6a3bf 100644 --- a/src/core/util/lazy_ptr.h +++ b/src/core/util/lazy_ptr.h @@ -59,7 +59,7 @@ template class lazy_ptr { public: lazy_ptr() {} - explicit lazy_ptr(std::function Constructor) { Init(Constructor); } + explicit lazy_ptr(std::function Constructor) { reset(Constructor); } lazy_ptr(lazy_ptr&& rhs) { obj = std::move(rhs.obj); diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp index b13c907b8..d36dc0d5d 100644 --- a/src/core/util/lnx/os_linux.cpp +++ b/src/core/util/lnx/os_linux.cpp @@ -108,13 +108,20 @@ class os_thread { err = pthread_attr_setstacksize(&attrib, stackSize); if (err != 0) { fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); - return; + err = pthread_attr_destroy(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_destroy failed: %s\n", strerror(err)); + return; + } } } + int cores = 0; + cpu_set_t* cpuset = 
nullptr; + if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) { - int cores = get_nprocs_conf(); - cpu_set_t* cpuset = CPU_ALLOC(cores); + cores = get_nprocs_conf(); + cpuset = CPU_ALLOC(cores); if (cpuset == nullptr) { fprintf(stderr, "CPU_ALLOC failed: %s\n", strerror(errno)); return; @@ -126,7 +133,7 @@ class os_thread { err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); CPU_FREE(cpuset); if (err != 0) { - fprintf(stderr, "pthread_attr_setaffinity_np failed: %s\n", strerror(err)); + fprintf(stderr, "pthread_setaffinity_np failed: %s\n", strerror(err)); return; } } @@ -642,11 +649,20 @@ SharedMutex CreateSharedMutex() { fprintf(stderr, "rw lock attribute init failed: %s\n", strerror(err)); return nullptr; } + +#ifdef __GLIBC__ err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); if (err != 0) { fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); return nullptr; } +#else + err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); + if (err != 0) { + fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } +#endif pthread_rwlock_t* lock = new pthread_rwlock_t; err = pthread_rwlock_init(lock, &attrib); diff --git a/src/core/util/utils.h b/src/core/util/utils.h index ab536ba79..1a454d7bd 100644 --- a/src/core/util/utils.h +++ b/src/core/util/utils.h @@ -74,8 +74,7 @@ static __forceinline void* _aligned_malloc(size_t size, size_t alignment) { return aligned_alloc(alignment, size); #else void *mem = NULL; - if (NULL != posix_memalign(&mem, alignment, size)) - return NULL; + if (0 != posix_memalign(&mem, alignment, size)) return NULL; return mem; #endif } diff --git a/src/hsacore.so.def b/src/hsacore.so.def index 4d9c92186..dd9b554a1 100644 --- a/src/hsacore.so.def +++ b/src/hsacore.so.def @@ -234,6 +234,7 @@ global: hsa_amd_portable_export_dmabuf; hsa_amd_portable_close_dmabuf; 
hsa_amd_vmem_address_reserve; + hsa_amd_vmem_address_reserve_align; hsa_amd_vmem_address_free; hsa_amd_vmem_handle_create; hsa_amd_vmem_handle_release; @@ -252,7 +253,14 @@ global: hsa_tools_scratch_event_free_end; hsa_tools_scratch_event_async_reclaim_start; hsa_tools_scratch_event_async_reclaim_end; - + hsa_ven_amd_pcs_iterate_configuration; + hsa_ven_amd_pcs_create; + hsa_ven_amd_pcs_create_from_id; + hsa_ven_amd_pcs_destroy; + hsa_ven_amd_pcs_start; + hsa_ven_amd_pcs_stop; + hsa_ven_amd_pcs_flush; + hsa_amd_queue_get_info; local: *; }; diff --git a/src/image/addrlib/inc/addrinterface.h b/src/image/addrlib/inc/addrinterface.h index 5260426b6..39dd282c7 100644 --- a/src/image/addrlib/inc/addrinterface.h +++ b/src/image/addrlib/inc/addrinterface.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,9 +19,8 @@ #include "addrtypes.h" namespace rocr { - -#define ADDRLIB_VERSION_MAJOR 6 -#define ADDRLIB_VERSION_MINOR 2 +#define ADDRLIB_VERSION_MAJOR 8 +#define ADDRLIB_VERSION_MINOR 10 #define ADDRLIB_VERSION ((ADDRLIB_VERSION_MAJOR << 16) | ADDRLIB_VERSION_MINOR) /// Virtually all interface functions need ADDR_HANDLE as first parameter @@ -47,6 +29,13 @@ typedef VOID* ADDR_HANDLE; /// Client handle used in callbacks typedef VOID* ADDR_CLIENT_HANDLE; +typedef struct _ADDR_EXTENT3D +{ + UINT_32 width; + UINT_32 height; + UINT_32 depth; // also slices for 2D images +} ADDR_EXTENT3D; + /** * ///////////////////////////////////////////////////////////////////////////////////////////////// * // Callback functions @@ -124,7 +113,7 @@ typedef union _ADDR_CHANNEL_SETTING struct { UINT_8 valid : 1; ///< Indicate whehter this channel setting is valid - UINT_8 channel : 2; ///< 0 for x channel, 1 for y channel, 2 for z channel + UINT_8 channel : 2; ///< 0 for x channel, 1 for y channel, 2 for z channel, 3 for MSAA sample index UINT_8 index : 5; ///< Channel index }; UINT_8 value; ///< Value @@ -158,18 +147,29 @@ typedef union _ADDR_EQUATION_KEY * @brief address equation structure **************************************************************************************************** */ -#define ADDR_MAX_EQUATION_BIT 20u +#define ADDR_MAX_LEGACY_EQUATION_COMP 3u +#define ADDR_MAX_EQUATION_COMP 5u +#define ADDR_MAX_EQUATION_BIT 20u // Invalid equation index #define ADDR_INVALID_EQUATION_INDEX 0xFFFFFFFF typedef struct _ADDR_EQUATION { - 
ADDR_CHANNEL_SETTING addr[ADDR_MAX_EQUATION_BIT]; ///< addr setting - ///< each bit is result of addr ^ xor ^ xor2 - ADDR_CHANNEL_SETTING xor1[ADDR_MAX_EQUATION_BIT]; ///< xor setting - ADDR_CHANNEL_SETTING xor2[ADDR_MAX_EQUATION_BIT]; ///< xor2 setting + union + { + struct { + ADDR_CHANNEL_SETTING addr[ADDR_MAX_EQUATION_BIT]; ///< addr setting + ADDR_CHANNEL_SETTING xor1[ADDR_MAX_EQUATION_BIT]; ///< xor setting + ADDR_CHANNEL_SETTING xor2[ADDR_MAX_EQUATION_BIT]; ///< xor2 setting + ADDR_CHANNEL_SETTING xor3[ADDR_MAX_EQUATION_BIT]; ///< xor3 setting + ADDR_CHANNEL_SETTING xor4[ADDR_MAX_EQUATION_BIT]; ///< xor4 setting + }; + ///< Components showing the sources of each bit; each bit is result of addr ^ xor ^ xor2... + ADDR_CHANNEL_SETTING comps[ADDR_MAX_EQUATION_COMP][ADDR_MAX_EQUATION_BIT]; + }; UINT_32 numBits; ///< The number of bits in equation + UINT_32 numBitComponents; ///< The max number of channels contributing to a bit BOOL_32 stackedDepthSlices; ///< TRUE if depth slices are treated as being ///< stacked vertically prior to swizzling } ADDR_EQUATION; @@ -1723,6 +1723,30 @@ typedef enum _AddrSwizzleGenOption ADDR_SWIZZLE_GEN_LINEAR = 1, ///< Using a linear increment of swizzle } AddrSwizzleGenOption; +/** +**************************************************************************************************** +* AddrBlockType +* +* @brief +* Macro define resource block type +**************************************************************************************************** +*/ +typedef enum +{ + AddrBlockLinear = 0, // Resource uses linear swizzle mode + AddrBlockMicro = 1, // Resource uses 256B block + AddrBlockThin4KB = 2, // Resource uses thin 4KB block + AddrBlockThick4KB = 3, // Resource uses thick 4KB block + AddrBlockThin64KB = 4, // Resource uses thin 64KB block + AddrBlockThick64KB = 5, // Resource uses thick 64KB block + AddrBlockThinVar = 6, // Resource uses thin var block + AddrBlockThickVar = 7, // Resource uses thick var block + 
AddrBlockMaxTiledType, + + AddrBlockThin256KB = AddrBlockThinVar, + AddrBlockThick256KB = AddrBlockThickVar, +} AddrBlockType; + /** **************************************************************************************************** * AddrSwizzleOption @@ -2408,7 +2432,8 @@ typedef union _ADDR2_SURFACE_FLAGS UINT_32 metaRbUnaligned : 1; ///< This resource has rb unaligned metadata UINT_32 metaPipeUnaligned : 1; ///< This resource has pipe unaligned metadata UINT_32 view3dAs2dArray : 1; ///< This resource is a 3D resource viewed as 2D array - UINT_32 reserved : 13; ///< Reserved bits + UINT_32 allowExtEquation : 1; ///< If unset, only legacy DX eqs are allowed (2 XORs) + UINT_32 reserved : 12; ///< Reserved bits }; UINT_32 value; @@ -2585,7 +2610,7 @@ typedef struct _ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT { UINT_32 size; ///< Size of this structure in bytes - UINT_64 addr; ///< Byte address + UINT_64 addr; ///< Byte offset from the image starting address UINT_32 bitPosition; ///< Bit position within surfaceAddr, 0-7. /// For surface bpp < 8, e.g. FMT_1. UINT_32 prtBlockIndex; ///< Index of a PRT tile (64K block) @@ -3691,7 +3716,7 @@ typedef struct _ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT AddrResourceType resourceType; ///< Surface type AddrFormat format; ///< Surface format UINT_32 width; ///< Width of mip0 in texels (not in compressed block) - UINT_32 height; ///< Height of mip0 in texels (not in compressed block) + UINT_32 height; ///< Height of mip0 in texels (not in compressed block) UINT_32 numSlices; ///< Number surface slice/depth of mip0 UINT_32 numMipLevels; ///< Total mipmap levels. 
UINT_32 pipeBankXor; ///< Combined swizzle used to do bank/pipe rotation @@ -3924,6 +3949,20 @@ ADDR_E_RETURNCODE ADDR_API Addr2GetPreferredSurfaceSetting( const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut); +/** +**************************************************************************************************** +* Addr2GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the +* client to choose from +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetPossibleSwizzleModes( + ADDR_HANDLE hLib, + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut); + /** **************************************************************************************************** * Addr2IsValidDisplaySwizzleMode @@ -3938,6 +3977,520 @@ ADDR_E_RETURNCODE ADDR_API Addr2IsValidDisplaySwizzleMode( UINT_32 bpp, BOOL_32 *pResult); -} // rocr +/** +**************************************************************************************************** +* Addr2GetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedBlockSet( + ADDR_HANDLE hLib, + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet); + +/** +**************************************************************************************************** +* Addr2GetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedSwSet( + ADDR_HANDLE hLib, + 
ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet); + +/** +**************************************************************************************************** +* Addr2IsBlockTypeAvailable +* +* @brief +* Determine whether a block type is allowed in a given blockSet +**************************************************************************************************** +*/ +BOOL_32 Addr2IsBlockTypeAvailable(ADDR2_BLOCK_SET blockSet, AddrBlockType blockType); + +/** +**************************************************************************************************** +* Addr2BlockTypeWithinMemoryBudget +* +* @brief +* Determine whether a new block type is acceptable based on memory waste ratio. Will favor +* larger block types. +**************************************************************************************************** +*/ +BOOL_32 Addr2BlockTypeWithinMemoryBudget( + UINT_64 minSize, + UINT_64 newBlockTypeSize, + UINT_32 ratioLow, + UINT_32 ratioHi, +#if defined(__cplusplus) + DOUBLE memoryBudget = 0.0f, + BOOL_32 newBlockTypeBigger = TRUE); +#else + DOUBLE memoryBudget, + BOOL_32 newBlockTypeBigger); +#endif + +/** +**************************************************************************************************** +* ADDR3_SURFACE_FLAGS +* +* @brief +* Surface flags +**************************************************************************************************** +*/ +typedef union _ADDR3_SURFACE_FLAGS +{ + struct + { + UINT_32 color : 1; ///< This resource is a color buffer, can be used with RTV + UINT_32 depth : 1; ///< This resource is a depth buffer, can be used with DSV + UINT_32 stencil : 1; ///< This resource is a stencil buffer, can be used with DSV + UINT_32 texture : 1; ///< This resource can be used with SRV + UINT_32 unordered : 1; ///< This resource can be used with UAV + UINT_32 hiZHiS : 1; + UINT_32 blockCompressed : 1; + UINT_32 nv12 : 1; + UINT_32 p010 : 1; + UINT_32 view3dAs2dArray : 1; + UINT_32 isVrsImage : 1; ///< 
This resource is a VRS source image + UINT_32 reserved : 21; ///< Reserved bits + }; + + UINT_32 value; +} ADDR3_SURFACE_FLAGS; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_INFO_INPUT +* +* @brief +* Input structure for Addr3ComputeSurfaceInfo +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_INFO_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + Addr3SwizzleMode swizzleMode; ///< Swizzle Mode for Gfx12 + AddrResourceType resourceType; ///< Surface type + AddrFormat format; ///< Surface format + UINT_32 bpp; ///< bits per pixel + UINT_32 width; ///< Width (of mip0), in pixels + UINT_32 height; ///< Height (of mip0), in pixels + UINT_32 numSlices; ///< Number surface slice/depth (of mip0), + UINT_32 numMipLevels; ///< Total mipmap levels. + UINT_32 numSamples; ///< Number of samples + UINT_32 pitchInElement; ///< Pitch in elements (blocks for compressed formats) + UINT_32 sliceAlign; ///< Required slice size in bytes +} ADDR3_COMPUTE_SURFACE_INFO_INPUT; + +/** +**************************************************************************************************** +* ADDR3_MIP_INFO +* +* @brief +* Structure that contains information for mip level +* +**************************************************************************************************** +*/ +typedef struct _ADDR3_MIP_INFO +{ + UINT_32 pitch; ///< Pitch in elements + UINT_32 height; ///< Padded height in elements + UINT_32 depth; ///< Padded depth + UINT_32 pixelPitch; ///< Pitch in pixels + UINT_32 pixelHeight; ///< Padded height in pixels + UINT_32 equationIndex; ///< Equation index in the equation table + UINT_64 offset; ///< Offset in bytes from mip base, should only be used + ///< to setup vam surface descriptor, can't be used + ///< to setup swizzle pattern + 
UINT_64 macroBlockOffset; ///< macro block offset in bytes from mip base + UINT_32 mipTailOffset; ///< mip tail offset in bytes + UINT_32 mipTailCoordX; ///< mip tail coord x + UINT_32 mipTailCoordY; ///< mip tail coord y + UINT_32 mipTailCoordZ; ///< mip tail coord z +} ADDR3_MIP_INFO; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_INFO_OUTPUT +* +* @brief +* Output structure for Addr3ComputeSurfaceInfo +* @note + Element: AddrLib unit for computing. e.g. BCn: 4x4 blocks; R32B32B32: 32bit with 3x pitch + Pixel: Original pixel +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_INFO_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 pitch; ///< Pitch in elements (blocks for compressed formats) + UINT_32 pixelPitch; ///< Pitch in original pixels + UINT_32 pixelHeight; ///< Height in original pixels + UINT_32 pixelBits; ///< Original bits per pixel, passed from input + UINT_32 bpp; ///< Bits per elements + /// (e.g. 
blocks for BCn, 1/3 for 96bit) + UINT_32 numSlices; ///< Padded depth for 3d resource + /// or padded number of slices for 2d array resource + UINT_32 height; ///< Padded height (of mip0) in elements + UINT_64 sliceSize; ///< Slice (total mip chain) size in bytes + UINT_64 surfSize; ///< Surface (total mip chain) size in bytes + UINT_32 baseAlign; ///< Base address alignment + ADDR_EXTENT3D blockExtent; ///< Dimensions in element inside one block + UINT_32 pixelMipChainPitch; ///< Mip chain pitch in original pixels + UINT_32 pixelMipChainHeight; ///< Mip chain height in original pixels + ADDR3_MIP_INFO* pMipInfo; ///< Info regarding the start, sizes of the mip levels + BOOL_32 mipChainInTail; ///< If whole mipchain falls into mip tail block + UINT_32 firstMipIdInTail; ///< The id of first mip in tail, if there is no mip + /// in tail, it will be set to number of mip levels +} ADDR3_COMPUTE_SURFACE_INFO_OUTPUT; + +/** +**************************************************************************************************** +* ADDR3_SWMODE_SET +* +* @brief +* Bit field that defines swizzle type +**************************************************************************************************** +*/ +// The bit order MUST be the same as Addr3SwizzleMode enumerations, otherwise using bitset to enable +// or disable swizzle modes will be problematic. 
+typedef union _ADDR3_SWMODE_SET +{ + struct + { + UINT_32 swLinear : 1; + UINT_32 sw2d256B : 1; + UINT_32 sw2d4kB : 1; + UINT_32 sw2d64kB : 1; + UINT_32 sw2d256kB : 1; + UINT_32 sw3d4kB : 1; + UINT_32 sw3d64kB : 1; + UINT_32 sw3d256kB : 1; + UINT_32 reserved : 24; + }; + + UINT_32 value; +} ADDR3_SWMODE_SET; + +/** +**************************************************************************************************** +* ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT +* +* @brief +* Input structure of Addr3GetPossibleSwizzleModes +**************************************************************************************************** +*/ +typedef struct _ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + AddrResourceType resourceType; ///< Surface type + UINT_32 bpp; ///< bits per pixel + UINT_32 width; ///< Width (of mip0), in pixels + UINT_32 height; ///< Height (of mip0), in pixels + UINT_32 numSlices; ///< Number surface slice/depth (of mip0), + UINT_32 numMipLevels; ///< Total mipmap levels. + UINT_32 numSamples; ///< Number of samples + UINT_32 maxAlign; ///< maximum base/size alignment requested by client +} ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT; + +/** +**************************************************************************************************** +* ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT +* +* @brief +* Output structure of Addr3GetPossibleSwizzleModes +**************************************************************************************************** +*/ +typedef struct _ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + ADDR3_SWMODE_SET validModes; ///< List of valid swizzle modes for this function. 
+} ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSurfaceInfo +* +* @brief +* Compute surface width/height/slices/alignments and suitable tiling mode +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceInfo( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut); + +/** +**************************************************************************************************** +* Addr3GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the +* client to choose from +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3GetPossibleSwizzleModes( + ADDR_HANDLE hLib, + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT +* +* @brief +* Input structure for Addr3ComputeSurfaceAddrFromCoord +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + UINT_32 x; ///< X coordinate + UINT_32 y; ///< Y coordinate + UINT_32 slice; ///< Slice index + UINT_32 sample; ///< Sample index, use fragment index for EQAA + UINT_32 mipId; ///< the mip ID in mip chain + + Addr3SwizzleMode swizzleMode; ///< Swizzle mode for Gfx12 + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + AddrResourceType resourceType; ///< Surface type + UINT_32 bpp; ///< Bits per pixel + ADDR_EXTENT3D unAlignedDims; ///< Surface original 
dimensions (of mip0) + UINT_32 numMipLevels; ///< Total mipmap levels + UINT_32 numSamples; ///< Number of samples + UINT_32 pitchInElement; ///< Pitch in elements (blocks for compressed formats) +} ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT +* +* @brief +* Output structure for Addr3ComputeSurfaceAddrFromCoord +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + UINT_64 addr; ///< Byte offset from the image starting address + UINT_32 bitPosition; ///< Bit position within surfaceAddr, 0-7. + /// For surface bpp < 8, e.g. FMT_1. + UINT_32 prtBlockIndex; ///< Index of a PRT tile (64K block) +} ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSurfaceAddrFromCoord +* +* @brief +* Compute surface address from a given coordinate. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceAddrFromCoord( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_PIPEBANKXOR_INPUT +* +* @brief +* Input structure of Addr3ComputePipebankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_PIPEBANKXOR_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 surfIndex; ///< Input surface index + Addr3SwizzleMode swizzleMode; ///< Surface swizzle mode +} ADDR3_COMPUTE_PIPEBANKXOR_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT +* +* @brief +* Output structure of Addr3ComputePipebankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 pipeBankXor; ///< Pipe bank xor +} ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputePipeBankXor +* +* @brief +* Calculate a valid bank pipe xor value for client to use. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputePipeBankXor( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT +* +* @brief +* Input structure of Addr3ComputeNonBlockCompressedView +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + Addr3SwizzleMode swizzleMode; ///< Swizzle Mode for Gfx12 + AddrResourceType resourceType; ///< Surface type + AddrFormat format; ///< Surface format + ADDR_EXTENT3D unAlignedDims; ///< Surface original dimensions (of mip0) + UINT_32 numMipLevels; ///< Total mipmap levels. 
+ UINT_32 pipeBankXor; ///< Combined swizzle used to do bank/pipe rotation + UINT_32 slice; ///< Index of slice to view + UINT_32 mipId; ///< Id of mip to view +} ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT +* +* @brief +* Output structure of Addr3ComputeNonBlockCompressedView +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_64 offset; ///< Offset from resource base for the view + UINT_32 pipeBankXor; ///< Pipe bank xor for the view + ADDR_EXTENT3D unAlignedDims; ///< Mip0 dimens (in element) for the view + UINT_32 numMipLevels; ///< Total mipmap levels for the view + UINT_32 mipId; ///< Mip ID for the view +} ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeNonBlockCompressedView +* +* @brief +* Compute non-block-compressed view for a given mipmap level/slice +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeNonBlockCompressedView( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT +* +* @brief +* Input structure of Addr3ComputeSubResourceOffsetForSwizzlePattern +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT +{ + UINT_32 size; ///< Size of this 
structure in bytes + Addr3SwizzleMode swizzleMode; ///< Surface swizzle mode + AddrResourceType resourceType; ///< Surface resource type + UINT_32 pipeBankXor; ///< Per resource xor + UINT_32 slice; ///< Slice id + UINT_64 sliceSize; ///< Slice size of a mip chain + UINT_64 macroBlockOffset; ///< Macro block offset, returned in ADDR3_MIP_INFO + UINT_32 mipTailOffset; ///< Mip tail offset, returned in ADDR3_MIP_INFO +} ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT +* +* @brief +* Output structure of Addr3ComputeSubResourceOffsetForSwizzlePattern +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_64 offset; ///< offset +} ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Calculate sub resource offset to support swizzle pattern. 
+**************************************************************************************************** +*/ +VOID ADDR_API Addr3ComputeSubResourceOffsetForSwizzlePattern( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT +* +* @brief +* Input structure of Addr2ComputeSlicePipeBankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + Addr3SwizzleMode swizzleMode; ///< Surface swizzle mode + AddrResourceType resourceType; ///< Surface resource type + UINT_32 bpe; ///< bits per element (e.g. block size for BCn format) + UINT_32 basePipeBankXor; ///< Base pipe bank xor + UINT_32 slice; ///< Slice id + UINT_32 numSamples; ///< Number of samples +} ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT +* +* @brief +* Output structure of Addr3ComputeSlicePipeBankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 pipeBankXor; ///< Pipe bank xor +} ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSlicePipeBankXor +* +* @brief +* Calculate slice pipe bank xor value based on base pipe bank xor and slice id. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSlicePipeBankXor( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut); +} // namespace rocr #endif // __ADDR_INTERFACE_H__ diff --git a/src/image/addrlib/inc/addrtypes.h b/src/image/addrlib/inc/addrtypes.h index ccecc2473..aa1b48873 100644 --- a/src/image/addrlib/inc/addrtypes.h +++ b/src/image/addrlib/inc/addrtypes.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -292,6 +275,26 @@ typedef enum _AddrSwizzleMode ADDR_SW_256KB_R_X = ADDR_SW_MISCDEF31, } AddrSwizzleMode; +/** +**************************************************************************************************** +* @brief +* Neutral enums that define swizzle modes for Gfx12+ ASIC +* +**************************************************************************************************** +*/ +typedef enum _Addr3SwizzleMode +{ + ADDR3_LINEAR = 0, + ADDR3_256B_2D = 1, + ADDR3_4KB_2D = 2, + ADDR3_64KB_2D = 3, + ADDR3_256KB_2D = 4, + ADDR3_4KB_3D = 5, + ADDR3_64KB_3D = 6, + ADDR3_256KB_3D = 7, + ADDR3_MAX_TYPE = 8, +} Addr3SwizzleMode; + /** **************************************************************************************************** * @brief @@ -454,6 +457,7 @@ typedef enum _AddrFormat { ADDR_FMT_ASTC_12x12 = 0x0000004d, ADDR_FMT_ETC2_64BPP = 0x0000004e, ADDR_FMT_ETC2_128BPP = 0x0000004f, + ADDR_FMT_BG_RG_16_16_16_16 = 0x00000050, } AddrFormat; /** diff --git a/src/image/addrlib/src/addrinterface.cpp b/src/image/addrlib/src/addrinterface.cpp index d1ebf2680..0bc83678d 100644 --- a/src/image/addrlib/src/addrinterface.cpp +++ b/src/image/addrlib/src/addrinterface.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -32,6 +15,7 @@ #include "addrinterface.h" #include "addrlib1.h" #include "addrlib2.h" +#include "addrlib3.h" #include "addrcommon.h" @@ -1796,7 +1780,377 @@ ADDR_E_RETURNCODE ADDR_API Addr2IsValidDisplaySwizzleMode( returnCode = ADDR_ERROR; } - return returnCode; + return returnCode; +} + +/** +**************************************************************************************************** +* Addr2GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the +* client to choose from +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetPossibleSwizzleModes( + ADDR_HANDLE hLib, ///< handle of addrlib 
+ const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ///< [in] input + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V2::Lib* pLib = V2::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->GetPossibleSwizzleModes(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} +/** +**************************************************************************************************** +* Addr2GetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedBlockSet( + ADDR_HANDLE hLib, ///< handle of addrlib + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + AddrResourceType rsrcType, ///< [in] resource type + ADDR2_BLOCK_SET* pAllowedBlockSet) ///< [out] allowed block sizes +{ + ADDR_E_RETURNCODE returnCode; + + V2::Lib* pLib = V2::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->GetAllowedBlockSet(allowedSwModeSet, rsrcType, pAllowedBlockSet); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr2GetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedSwSet( + ADDR_HANDLE hLib, ///< handle of addrlib + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + ADDR2_SWTYPE_SET* pAllowedSwSet) ///< [out] allowed swizzle types +{ + ADDR_E_RETURNCODE returnCode; + + V2::Lib* pLib = V2::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->GetAllowedSwSet(allowedSwModeSet, pAllowedSwSet); + } 
+ else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr2IsBlockTypeAvailable +* +* @brief +* Determine whether a block type is allowed in a given blockSet +**************************************************************************************************** +*/ +BOOL_32 Addr2IsBlockTypeAvailable( + ADDR2_BLOCK_SET blockSet, + AddrBlockType blockType) +{ + BOOL_32 avail; + + if (blockType == AddrBlockLinear) + { + avail = blockSet.linear ? TRUE : FALSE; + } + else + { + avail = blockSet.value & (1 << (static_cast(blockType) - 1)) ? TRUE : FALSE; + } + + return avail; +} + +/** +**************************************************************************************************** +* Addr2BlockTypeWithinMemoryBudget +* +* @brief +* Determine whether a new block type is acceptable based on memory waste ratio. Will favor +* larger block types. +**************************************************************************************************** +*/ +BOOL_32 Addr2BlockTypeWithinMemoryBudget( + UINT_64 minSize, + UINT_64 newBlockTypeSize, + UINT_32 ratioLow, + UINT_32 ratioHi, + DOUBLE memoryBudget, + BOOL_32 newBlockTypeBigger) +{ + BOOL_32 accept = FALSE; + + if (memoryBudget >= 1.0) + { + if (newBlockTypeBigger) + { + if ((static_cast(newBlockTypeSize) / minSize) <= memoryBudget) + { + accept = TRUE; + } + } + else + { + if ((static_cast(minSize) / newBlockTypeSize) > memoryBudget) + { + accept = TRUE; + } + } + } + else + { + if (newBlockTypeBigger) + { + if ((newBlockTypeSize * ratioHi) <= (minSize * ratioLow)) + { + accept = TRUE; + } + } + else + { + if ((newBlockTypeSize * ratioLow) < (minSize * ratioHi)) + { + accept = TRUE; + } + } + } + + return accept; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Surface functions for Addr3 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +/** +**************************************************************************************************** +* Addr3ComputeSurfaceInfo +* +* @brief +* Calculate surface width/height/depth/alignments and suitable tiling mode +* +* @return +* ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceInfo( + ADDR_HANDLE hLib, ///< address lib handle + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] surface information + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) ///< [out] surface parameters and alignments +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pLib != NULL) + { + returnCode = pLib->ComputeSurfaceInfo(pIn, pOut); + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr3GetPossibleSwizzleModes +* +* @brief +* Get valid swizzle mode options given image input for further optimal selection +* +* @return +* ADDR_OK if successful, otherwise an error code of ADDR_PARAMSIZEMISMATCH +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3GetPossibleSwizzleModes( + ADDR_HANDLE hLib, ///< address lib handle + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, ///< [in] surface information + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut) ///< [out] allowable swizzle mdoes +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pLib != NULL) + { + returnCode = pLib->GetPossibleSwizzleModes(pIn, pOut); + } + + return returnCode; +} + +/** +**************************************************************************************************** +* 
Addr3ComputeSurfaceAddrFromCoord +* +* @brief +* Compute surface address according to coordinates +* +* @return +* ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceAddrFromCoord( + ADDR_HANDLE hLib, ///< address lib handle + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] surface info and coordinates + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) ///< [out] surface address +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pLib != NULL) + { + returnCode = pLib->ComputeSurfaceAddrFromCoord(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; } -} // namespace rocr \ No newline at end of file +/** +**************************************************************************************************** +* Addr3ComputePipeBankXor +* +* @brief +* Calculate a valid bank pipe xor value for client to use. +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputePipeBankXor( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->ComputePipeBankXor(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr3ComputeNonBlockCompressedView +* +* @brief +* Compute non-block-compressed view for a given mipmap level/slice. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeNonBlockCompressedView( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->ComputeNonBlockCompressedView(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr3ComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Calculate sub resource offset for swizzle pattern. +**************************************************************************************************** +*/ +VOID ADDR_API Addr3ComputeSubResourceOffsetForSwizzlePattern( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) ///< [out] output +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + pLib->ComputeSubResourceOffsetForSwizzlePattern(pIn, pOut); + } +} + +/** +**************************************************************************************************** +* Addr3ComputeSlicePipeBankXor +* +* @brief +* Calculate slice pipe bank xor value based on base pipe bank xor and slice id. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSlicePipeBankXor( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->ComputeSlicePipeBankXor(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} +} //namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/amdgpu_asic_addr.h b/src/image/addrlib/src/amdgpu_asic_addr.h index c384c138c..1909e56cb 100644 --- a/src/image/addrlib/src/amdgpu_asic_addr.h +++ b/src/image/addrlib/src/amdgpu_asic_addr.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -33,24 +16,23 @@ #define AMDGPU_VENDOR_IS_AMD(v) ((v == ATI_VENDOR_ID) || (v == AMD_VENDOR_ID)) #define FAMILY_UNKNOWN 0x00 -#define FAMILY_TN 0x69 -#define FAMILY_SI 0x6E -#define FAMILY_CI 0x78 -#define FAMILY_KV 0x7D -#define FAMILY_VI 0x82 -#define FAMILY_POLARIS 0x82 -#define FAMILY_CZ 0x87 -#define FAMILY_AI 0x8D -#define FAMILY_RV 0x8E -#define FAMILY_NV 0x8F -#define FAMILY_VGH 0x90 -#define FAMILY_GFX1100 0x91 -#define FAMILY_GFX1103 0x94 +#define FAMILY_TN 0x69 //# 105 / Trinity APUs +#define FAMILY_SI 0x6E //# 110 / Southern Islands: Tahiti, Pitcairn, CapeVerde, Oland, Hainan +#define FAMILY_CI 0x78 //# 120 / Sea Islands: Bonaire, Hawaii +#define FAMILY_KV 0x7D //# 125 / Kaveri APUs: Spectre, Spooky, Kalindi, Godavari +#define FAMILY_VI 0x82 //# 130 / Volcanic Islands: Iceland, Tonga, Fiji +#define FAMILY_CZ 0x87 //# 135 / Carrizo APUs: Carrizo, Stoney +#define FAMILY_AI 0x8D //# 141 / Vega: 10, 20 +#define FAMILY_RV 0x8E //# 142 / Raven +#define FAMILY_NV 0x8F //# 143 / Navi: 10 +#define FAMILY_VGH 0x90 //# 144 / Van Gogh +#define FAMILY_NV3 0x91 //# 145 / Navi: 3x #define FAMILY_GFX1150 0x96 -#define FAMILY_RMB 0x92 -#define FAMILY_GC_10_3_6 0x95 -#define FAMILY_GC_10_3_7 0x97 - +#define FAMILY_GFX1103 0x94 +#define FAMILY_RMB 0x92 //# 146 / Rembrandt +#define FAMILY_RPL 0x95 //# 149 / Raphael +#define FAMILY_MDN 0x97 //# 151 / Mendocino +#define FAMILY_GFX12 0x98 // AMDGPU_FAMILY_IS(familyId, familyName) #define FAMILY_IS(f, fn) (f == FAMILY_##fn) @@ -64,70 +46,72 @@ #define FAMILY_IS_AI(f) FAMILY_IS(f, AI) #define 
FAMILY_IS_RV(f) FAMILY_IS(f, RV) #define FAMILY_IS_NV(f) FAMILY_IS(f, NV) +#define FAMILY_IS_NV3(f) FAMILY_IS(f, NV3) #define FAMILY_IS_RMB(f) FAMILY_IS(f, RMB) -#define FAMILY_IS_GFX1100(f) FAMILY_IS(f, GFX1100) -#define FAMILY_IS_GFX1103(f) FAMILY_IS(f, GFX1103) -#define FAMILY_IS_GFX1150(f) FAMILY_IS(f, GFX1150) +#define FAMILY_IS_GFX12(f) FAMILY_IS(f, GFX12) #define AMDGPU_UNKNOWN 0xFF -#define AMDGPU_TAHITI_RANGE 0x05, 0x14 -#define AMDGPU_PITCAIRN_RANGE 0x15, 0x28 -#define AMDGPU_CAPEVERDE_RANGE 0x29, 0x3C -#define AMDGPU_OLAND_RANGE 0x3C, 0x46 -#define AMDGPU_HAINAN_RANGE 0x46, 0xFF +#define AMDGPU_TAHITI_RANGE 0x05, 0x14 //# 5 <= x < 20 +#define AMDGPU_PITCAIRN_RANGE 0x15, 0x28 //# 21 <= x < 40 +#define AMDGPU_CAPEVERDE_RANGE 0x29, 0x3C //# 41 <= x < 60 +#define AMDGPU_OLAND_RANGE 0x3C, 0x46 //# 60 <= x < 70 +#define AMDGPU_HAINAN_RANGE 0x46, 0xFF //# 70 <= x < max + +#define AMDGPU_BONAIRE_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_HAWAII_RANGE 0x28, 0x3C //# 40 <= x < 60 + +#define AMDGPU_SPECTRE_RANGE 0x01, 0x41 //# 1 <= x < 65 +#define AMDGPU_SPOOKY_RANGE 0x41, 0x81 //# 65 <= x < 129 +#define AMDGPU_KALINDI_RANGE 0x81, 0xA1 //# 129 <= x < 161 +#define AMDGPU_GODAVARI_RANGE 0xA1, 0xFF //# 161 <= x < max -#define AMDGPU_BONAIRE_RANGE 0x14, 0x28 -#define AMDGPU_HAWAII_RANGE 0x28, 0x3C +#define AMDGPU_ICELAND_RANGE 0x01, 0x14 //# 1 <= x < 20 +#define AMDGPU_TONGA_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_FIJI_RANGE 0x3C, 0x50 //# 60 <= x < 80 -#define AMDGPU_SPECTRE_RANGE 0x01, 0x41 -#define AMDGPU_SPOOKY_RANGE 0x41, 0x81 -#define AMDGPU_KALINDI_RANGE 0x81, 0xA1 -#define AMDGPU_GODAVARI_RANGE 0xA1, 0xFF +#define AMDGPU_POLARIS10_RANGE 0x50, 0x5A //# 80 <= x < 90 +#define AMDGPU_POLARIS11_RANGE 0x5A, 0x64 //# 90 <= x < 100 +#define AMDGPU_POLARIS12_RANGE 0x64, 0x6E //# 100 <= x < 110 +#define AMDGPU_VEGAM_RANGE 0x6E, 0xFF //# 110 <= x < max -#define AMDGPU_ICELAND_RANGE 0x01, 0x14 -#define AMDGPU_TONGA_RANGE 0x14, 0x28 -#define 
AMDGPU_FIJI_RANGE 0x3C, 0x50 +#define AMDGPU_CARRIZO_RANGE 0x01, 0x21 //# 1 <= x < 33 +#define AMDGPU_BRISTOL_RANGE 0x10, 0x21 //# 16 <= x < 33 +#define AMDGPU_STONEY_RANGE 0x61, 0xFF //# 97 <= x < max -#define AMDGPU_POLARIS10_RANGE 0x50, 0x5A -#define AMDGPU_POLARIS11_RANGE 0x5A, 0x64 -#define AMDGPU_POLARIS12_RANGE 0x64, 0x6E -#define AMDGPU_VEGAM_RANGE 0x6E, 0xFF +#define AMDGPU_VEGA10_RANGE 0x01, 0x14 //# 1 <= x < 20 +#define AMDGPU_VEGA12_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_VEGA20_RANGE 0x28, 0xFF //# 40 <= x < max -#define AMDGPU_CARRIZO_RANGE 0x01, 0x21 -#define AMDGPU_STONEY_RANGE 0x61, 0xFF +#define AMDGPU_RAVEN_RANGE 0x01, 0x81 //# 1 <= x < 129 +#define AMDGPU_RAVEN2_RANGE 0x81, 0x90 //# 129 <= x < 144 +#define AMDGPU_RENOIR_RANGE 0x91, 0xFF //# 145 <= x < max -#define AMDGPU_VEGA10_RANGE 0x01, 0x14 -#define AMDGPU_VEGA12_RANGE 0x14, 0x28 -#define AMDGPU_VEGA20_RANGE 0x28, 0x32 -#define AMDGPU_ARCTURUS_RANGE 0x32, 0x3C -#define AMDGPU_ALDEBARAN_RANGE 0x3C, 0xFF +#define AMDGPU_NAVI10_RANGE 0x01, 0x0A //# 1 <= x < 10 +#define AMDGPU_NAVI12_RANGE 0x0A, 0x14 //# 10 <= x < 20 +#define AMDGPU_NAVI14_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_NAVI21_RANGE 0x28, 0x32 //# 40 <= x < 50 +#define AMDGPU_NAVI22_RANGE 0x32, 0x3C //# 50 <= x < 60 +#define AMDGPU_NAVI23_RANGE 0x3C, 0x46 //# 60 <= x < 70 +#define AMDGPU_NAVI24_RANGE 0x46, 0x50 //# 70 <= x < 80 -#define AMDGPU_RAVEN_RANGE 0x01, 0x81 -#define AMDGPU_RAVEN2_RANGE 0x81, 0x91 -#define AMDGPU_RENOIR_RANGE 0x91, 0xFF +#define AMDGPU_VANGOGH_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_NAVI10_RANGE 0x01, 0x0A -#define AMDGPU_NAVI12_RANGE 0x0A, 0x14 -#define AMDGPU_NAVI14_RANGE 0x14, 0x28 -#define AMDGPU_NAVI21_RANGE 0x28, 0x32 -#define AMDGPU_NAVI22_RANGE 0x32, 0x3C -#define AMDGPU_NAVI23_RANGE 0x3C, 0x46 -#define AMDGPU_NAVI24_RANGE 0x46, 0x50 +#define AMDGPU_NAVI31_RANGE 0x01, 0x10 //# 01 <= x < 16 +#define AMDGPU_NAVI32_RANGE 0x20, 0xFF //# 32 <= x < 255 +#define AMDGPU_NAVI33_RANGE 
0x10, 0x20 //# 16 <= x < 32 +#define AMDGPU_GFX1103_R1_RANGE 0x01, 0x80 //# 1 <= x < 128 +#define AMDGPU_GFX1103_R2_RANGE 0x80, 0xC0 //# 128 <= x < 192 -#define AMDGPU_VANGOGH_RANGE 0x01, 0xFF +#define AMDGPU_GFX1150_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_GFX1100_RANGE 0x01, 0x10 -#define AMDGPU_GFX1101_RANGE 0x20, 0xFF -#define AMDGPU_GFX1102_RANGE 0x10, 0x20 -#define AMDGPU_GFX1103_RANGE 0x01, 0xFF -#define AMDGPU_GFX1150_RANGE 0x01, 0xFF +#define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF //# 01 <= x < 255 -#define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF +#define AMDGPU_RAPHAEL_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_GFX1036_RANGE 0x01, 0xFF +#define AMDGPU_MENDOCINO_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_GFX1037_RANGE 0x01, 0xFF +#define AMDGPU_GFX12_TBD1_RANGE 0x40, 0xFF //# 64 <= x < max #define AMDGPU_EXPAND_FIX(x) x #define AMDGPU_RANGE_HELPER(val, min, max) ((val >= min) && (val < max)) @@ -160,6 +144,7 @@ #define ASICREV_IS_VEGAM_P(r) ASICREV_IS(r, VEGAM) #define ASICREV_IS_CARRIZO(r) ASICREV_IS(r, CARRIZO) +#define ASICREV_IS_CARRIZO_BRISTOL(r) ASICREV_IS(r, BRISTOL) #define ASICREV_IS_STONEY(r) ASICREV_IS(r, STONEY) #define ASICREV_IS_VEGA10_M(r) ASICREV_IS(r, VEGA10) @@ -167,8 +152,6 @@ #define ASICREV_IS_VEGA12_P(r) ASICREV_IS(r, VEGA12) #define ASICREV_IS_VEGA12_p(r) ASICREV_IS(r, VEGA12) #define ASICREV_IS_VEGA20_P(r) ASICREV_IS(r, VEGA20) -#define ASICREV_IS_ARCTURUS(r) ASICREV_IS(r, ARCTURUS) -#define ASICREV_IS_ALDEBARAN(r) ASICREV_IS(r, ALDEBARAN) #define ASICREV_IS_RAVEN(r) ASICREV_IS(r, RAVEN) #define ASICREV_IS_RAVEN2(r) ASICREV_IS(r, RAVEN2) @@ -190,16 +173,20 @@ #define ASICREV_IS_VANGOGH(r) ASICREV_IS(r, VANGOGH) -#define ASICREV_IS_GFX1100(r) ASICREV_IS(r, GFX1100) -#define ASICREV_IS_GFX1101(r) ASICREV_IS(r, GFX1101) -#define ASICREV_IS_GFX1102(r) ASICREV_IS(r, GFX1102) -#define ASICREV_IS_GFX1103(r) ASICREV_IS(r, GFX1103) +#define ASICREV_IS_NAVI31_P(r) ASICREV_IS(r, NAVI31) +#define ASICREV_IS_NAVI32_P(r) ASICREV_IS(r, 
NAVI32) +#define ASICREV_IS_NAVI33_P(r) ASICREV_IS(r, NAVI33) +#define ASICREV_IS_GFX1150(r) ASICREV_IS(r, GFX1150) +#define ASICREV_IS_GFX1103_R1(r) ASICREV_IS(r, GFX1103_R1) +#define ASICREV_IS_GFX1103_R2(r) ASICREV_IS(r, GFX1103_R2) #define ASICREV_IS_GFX1150(r) ASICREV_IS(r, GFX1150) #define ASICREV_IS_REMBRANDT(r) ASICREV_IS(r, REMBRANDT) -#define ASICREV_IS_GFX1036(r) ASICREV_IS(r, GFX1036) +#define ASICREV_IS_RAPHAEL(r) ASICREV_IS(r, RAPHAEL) + +#define ASICREV_IS_MENDOCINO(r) ASICREV_IS(r, MENDOCINO) -#define ASICREV_IS_GFX1037(r) ASICREV_IS(r, GFX1037) +#define ASICREV_IS_GFX12_TBD1_P(r) ASICREV_IS(r, GFX12_TBD1) #endif // _AMDGPU_ASIC_ADDR_H diff --git a/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h b/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h index 7383c4e06..9f0521c1f 100644 --- a/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h +++ b/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ diff --git a/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h b/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h index 99a66c08d..12ab84da8 100644 --- a/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h +++ b/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -74,3 +57,4 @@ union GB_ADDR_CONFIG_GFX11 }; #endif + diff --git a/src/image/addrlib/src/chip/gfx12/gfx12_gb_reg.h b/src/image/addrlib/src/chip/gfx12/gfx12_gb_reg.h new file mode 100644 index 000000000..389b3871d --- /dev/null +++ b/src/image/addrlib/src/chip/gfx12/gfx12_gb_reg.h @@ -0,0 +1,57 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2007-2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +#if !defined (__GFX12_GB_REG_H__) +#define __GFX12_GB_REG_H__ + +/* +* gfx12_gb_reg.h +* +* Register Spec Release: 1.0 +* +*/ + +// +// Make sure the necessary endian defines are there. 
+// +#if defined(LITTLEENDIAN_CPU) +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +union GB_ADDR_CONFIG_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_PIPES : 3; + unsigned int PIPE_INTERLEAVE_SIZE : 3; + unsigned int MAX_COMPRESSED_FRAGS : 2; + unsigned int NUM_PKRS : 3; + unsigned int : 8; + unsigned int NUM_SHADER_ENGINES : 4; + unsigned int : 3; + unsigned int NUM_RB_PER_SE : 2; + unsigned int : 4; +#elif defined(BIGENDIAN_CPU) + unsigned int : 4; + unsigned int NUM_RB_PER_SE : 2; + unsigned int : 3; + unsigned int NUM_SHADER_ENGINES : 4; + unsigned int : 8; + unsigned int NUM_PKRS : 3; + unsigned int MAX_COMPRESSED_FRAGS : 2; + unsigned int PIPE_INTERLEAVE_SIZE : 3; + unsigned int NUM_PIPES : 3; +#endif + } bitfields, bits; + unsigned int u32All; + int i32All; + float f32All; +}; + +#endif \ No newline at end of file diff --git a/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h b/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h index b0be682cc..8ff6939ab 100644 --- a/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h +++ b/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ diff --git a/src/image/addrlib/src/chip/r800/si_gb_reg.h b/src/image/addrlib/src/chip/r800/si_gb_reg.h index 3f5f4071e..c5bb578f9 100644 --- a/src/image/addrlib/src/chip/r800/si_gb_reg.h +++ b/src/image/addrlib/src/chip/r800/si_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -93,9 +76,52 @@ #endif +#if defined(LITTLEENDIAN_CPU) + + typedef struct _GB_ADDR_CONFIG_N { + unsigned int num_pipes : 3; + unsigned int pipe_interleave_size : 3; + unsigned int max_compressed_frags : 2; + unsigned int bank_interleave_size : 3; + unsigned int : 1; + unsigned int num_banks : 3; + unsigned int : 1; + unsigned int shader_engine_tile_size : 3; + unsigned int num_shader_engines : 2; + unsigned int num_gpus : 3; + unsigned int multi_gpu_tile_size : 2; + unsigned int num_rb_per_se : 2; + unsigned int row_size : 2; + unsigned int num_lower_pipes : 1; + unsigned int se_enable : 1; + } GB_ADDR_CONFIG_N; + +#elif defined(BIGENDIAN_CPU) + + typedef struct _GB_ADDR_CONFIG_N { + unsigned int se_enable : 1; + unsigned int num_lower_pipes : 1; + unsigned int row_size : 2; + unsigned int num_rb_per_se : 2; + unsigned int multi_gpu_tile_size : 2; + unsigned int num_gpus : 3; + unsigned int num_shader_engines : 2; + unsigned int shader_engine_tile_size : 3; + unsigned int : 1; + unsigned int num_banks : 3; + unsigned int : 1; + unsigned int bank_interleave_size : 3; + unsigned int max_compressed_frags : 2; + unsigned int pipe_interleave_size : 3; + unsigned int num_pipes : 3; + } GB_ADDR_CONFIG_N; + +#endif + typedef union { unsigned int val : 32; GB_ADDR_CONFIG_T f; + GB_ADDR_CONFIG_N n; } GB_ADDR_CONFIG; #if defined(LITTLEENDIAN_CPU) diff --git a/src/image/addrlib/src/core/addrcommon.h b/src/image/addrlib/src/core/addrcommon.h index 6b8fa0a5c..894892574 100644 --- a/src/image/addrlib/src/core/addrcommon.h +++ 
b/src/image/addrlib/src/core/addrcommon.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -42,9 +25,13 @@ #endif #if defined(__GNUC__) + #include #include #endif +#if defined(_WIN32) +#include +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// // Platform specific debug break defines @@ -89,7 +76,13 @@ #else #define ADDR_ASSERT(__e) if ( !((__e) ? 
TRUE : FALSE)) { ADDR_DBG_BREAK(); } #endif - #define ADDR_ASSERT_ALWAYS() ADDR_DBG_BREAK() + + #if ADDR_SILENCE_ASSERT_ALWAYS + #define ADDR_ASSERT_ALWAYS() + #else + #define ADDR_ASSERT_ALWAYS() ADDR_DBG_BREAK() + #endif + #define ADDR_UNHANDLED_CASE() ADDR_ASSERT(!"Unhandled case") #define ADDR_NOT_IMPLEMENTED() ADDR_ASSERT(!"Not implemented"); #else //DEBUG @@ -192,9 +185,11 @@ #endif namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Common constants //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -318,6 +313,49 @@ static inline UINT_32 XorReduce( return result; } +/** +**************************************************************************************************** +* Unset least bit +* +* @brief +* Returns a copy of the value with the least-significant '1' bit unset +**************************************************************************************************** +*/ +static inline UINT_32 UnsetLeastBit( + UINT_32 val) +{ + return val & (val - 1); +} + +/** +**************************************************************************************************** +* BitScanForward +* +* @brief +* Returns the index-position of the least-significant '1' bit. Must not be 0. 
+**************************************************************************************************** +*/ +static inline UINT_32 BitScanForward( + UINT_32 mask) ///< [in] Bitmask to scan +{ + ADDR_ASSERT(mask > 0); + unsigned long out = 0; +#if (defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64)) + out = ::_tzcnt_u32(mask); +#elif (defined(_WIN32) || defined(_WIN64)) + ::_BitScanForward(&out, mask); +#elif defined(__GNUC__) + out = __builtin_ctz(mask); +#else + while ((mask & 1) == 0) + { + mask >>= 1; + out++; + } +#endif + return out; +} + /** **************************************************************************************************** * IsPow2 @@ -974,6 +1012,37 @@ static inline UINT_32 GetCoordActiveMask( return mask; } +/** +**************************************************************************************************** +* FillEqBitComponents +* +* @brief +* Fill the 'numBitComponents' field based on the equation. +**************************************************************************************************** +*/ +static inline void FillEqBitComponents( + ADDR_EQUATION *pEquation) // [in/out] Equation to calculate bit components for +{ + pEquation->numBitComponents = 1; // We always have at least the address + for (UINT_32 xorN = 1; xorN < ADDR_MAX_EQUATION_COMP; xorN++) + { + for (UINT_32 bit = 0; bit < ADDR_MAX_EQUATION_BIT; bit++) + { + if (pEquation->comps[xorN][bit].valid) + { + pEquation->numBitComponents = xorN + 1; + break; + } + } + + if (pEquation->numBitComponents != (xorN + 1)) + { + // Skip following components if this one wasn't valid + break; + } + } +} + /** **************************************************************************************************** * ShiftCeil @@ -1005,7 +1074,7 @@ static inline UINT_32 ShiftRight( } } // Addr -} // rocr +} // namespace rocr #endif // __ADDR_COMMON_H__ diff --git a/src/image/addrlib/src/core/addrelemlib.cpp b/src/image/addrlib/src/core/addrelemlib.cpp index 
b3bff74fc..615d8f0e3 100644 --- a/src/image/addrlib/src/core/addrelemlib.cpp +++ b/src/image/addrlib/src/core/addrelemlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,7 +17,8 @@ #include "addrlib.h" namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -1407,6 +1391,10 @@ UINT_32 ElemLib::GetBitsPerPixel( case ADDR_FMT_24_8: bpp = 32; break; + case ADDR_FMT_BG_RG_16_16_16_16: + elemMode = ADDR_PACKED_BGRG; + bpp = 32; + break; case ADDR_FMT_16_16_16_16: case ADDR_FMT_32_32: case ADDR_FMT_CTX1: @@ -1818,6 +1806,7 @@ BOOL_32 ElemLib::IsMacroPixelPacked( { case ADDR_FMT_BG_RG: case ADDR_FMT_GB_GR: + case ADDR_FMT_BG_RG_16_16_16_16: isMacroPixelPacked = TRUE; break; default: @@ -1827,5 +1816,5 @@ BOOL_32 ElemLib::IsMacroPixelPacked( return isMacroPixelPacked; } -} // Addr -} // rocr +} +} //namespace rocr diff --git a/src/image/addrlib/src/core/addrelemlib.h b/src/image/addrlib/src/core/addrelemlib.h index 308c9844b..3352279de 100644 --- a/src/image/addrlib/src/core/addrelemlib.h +++ b/src/image/addrlib/src/core/addrelemlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -39,7 +22,8 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { +namespace Addr +{ class Lib; @@ -273,8 +257,7 @@ class ElemLib : public Object Addr::Lib* const m_pAddrLib; ///< Pointer to parent addrlib instance }; -} // Addr -} // rocr - +} //Addr +} //namespace rocr #endif diff --git a/src/image/addrlib/src/core/addrlib.cpp b/src/image/addrlib/src/core/addrlib.cpp index a958cd11e..d7d322f8d 100644 --- a/src/image/addrlib/src/core/addrlib.cpp +++ b/src/image/addrlib/src/core/addrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -81,7 +64,8 @@ UINT_32 __umoddi3(UINT_64 n, UINT_32 base) #endif // __APPLE__ namespace rocr { -namespace Addr { +namespace Addr +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Constructor/Destructor @@ -228,15 +212,18 @@ ADDR_E_RETURNCODE Lib::Create( case FAMILY_NV: case FAMILY_VGH: case FAMILY_RMB: - case FAMILY_GC_10_3_6: - case FAMILY_GC_10_3_7: + case FAMILY_RPL: + case FAMILY_MDN: pLib = Gfx10HwlInit(&client); break; - case FAMILY_GFX1100: - case FAMILY_GFX1103: + case FAMILY_NV3: case FAMILY_GFX1150: + case FAMILY_GFX1103: pLib = Gfx11HwlInit(&client); break; + case FAMILY_GFX12: + pLib = Gfx12HwlInit(&client); + break; default: ADDR_ASSERT_ALWAYS(); break; @@ -247,7 +234,10 @@ 
ADDR_E_RETURNCODE Lib::Create( break; } } - + if(pLib == NULL) + { + returnCode = ADDR_OUTOFMEMORY; + } if (pLib != NULL) { BOOL_32 initValid; @@ -286,6 +276,7 @@ ADDR_E_RETURNCODE Lib::Create( { delete pLib; pLib = NULL; + returnCode = ADDR_OUTOFMEMORY; ADDR_ASSERT_ALWAYS(); } else @@ -305,12 +296,6 @@ ADDR_E_RETURNCODE Lib::Create( pLib->SetMaxAlignments(); } - else if ((pLib == NULL) && - (returnCode == ADDR_OK)) - { - // Unknown failures, we return the general error code - returnCode = ADDR_ERROR; - } return returnCode; } @@ -673,4 +658,4 @@ UINT_32 Lib::GetBpe(AddrFormat format) const } } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/core/addrlib.h b/src/image/addrlib/src/core/addrlib.h index 0d16762a5..cce002ab1 100644 --- a/src/image/addrlib/src/core/addrlib.h +++ b/src/image/addrlib/src/core/addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,6 +17,7 @@ #define __ADDR_LIB_H__ #include "addrinterface.h" +#include "addrtypes.h" #include "addrobject.h" #include "addrelemlib.h" @@ -56,7 +40,8 @@ #endif namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -266,7 +251,7 @@ class Lib : public Object } /// Returns asic chip family name defined by AddrLib - ChipFamily GetChipFamily() + ChipFamily GetChipFamily() const { return m_chipFamily; } @@ -316,6 +301,21 @@ class Lib : public Object #endif } + static BOOL_32 IsTex1d(AddrResourceType resourceType) + { + return (resourceType == ADDR_RSRC_TEX_1D); + } + + static BOOL_32 IsTex2d(AddrResourceType resourceType) + { + return (resourceType == ADDR_RSRC_TEX_2D); + } + + static BOOL_32 IsTex3d(AddrResourceType resourceType) + { + return (resourceType == ADDR_RSRC_TEX_3D); + } + // // Initialization // @@ -408,7 +408,7 @@ Lib* CiHwlInit (const Client* pClient); Lib* Gfx9HwlInit (const Client* pClient); Lib* Gfx10HwlInit(const Client* pClient); Lib* Gfx11HwlInit(const Client* pClient); +Lib* Gfx12HwlInit(const Client* pClient); } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/core/addrlib1.cpp b/src/image/addrlib/src/core/addrlib1.cpp index c6ab5b3da..c99d0af0a 100644 --- a/src/image/addrlib/src/core/addrlib1.cpp +++ b/src/image/addrlib/src/core/addrlib1.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * 
Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -35,8 +18,10 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Static Const Member @@ -2994,6 +2979,7 @@ ADDR_E_RETURNCODE Lib::ComputeMicroTileEquation( // stackedDepthSlices is used for addressing mode that a tile block contains multiple slices, // which is not supported by our address lib pEquation->stackedDepthSlices = FALSE; + pEquation->numBitComponents = 1; return retCode; } @@ -4070,4 +4056,4 @@ ADDR_E_RETURNCODE Lib::ComputePrtInfo( } // V1 } // Addr -} // namespace rocr +} // namespace rocr \ No newline at 
end of file diff --git a/src/image/addrlib/src/core/addrlib1.h b/src/image/addrlib/src/core/addrlib1.h index a6b7fe32d..8b5dde206 100644 --- a/src/image/addrlib/src/core/addrlib1.h +++ b/src/image/addrlib/src/core/addrlib1.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "addrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** diff --git a/src/image/addrlib/src/core/addrlib2.cpp b/src/image/addrlib/src/core/addrlib2.cpp index e23029100..43d4c0f9d 100644 --- a/src/image/addrlib/src/core/addrlib2.cpp +++ b/src/image/addrlib/src/core/addrlib2.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,8 +19,10 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Static Const Member @@ -302,6 +287,12 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( if (localIn.flags.needEquation && (Log2(localIn.numFrags) == 0)) { pOut->equationIndex = GetEquationIndex(&localIn, pOut); + if ((localIn.flags.allowExtEquation == 0) && + (pOut->equationIndex != ADDR_INVALID_EQUATION_INDEX) && + (m_equationTable[pOut->equationIndex].numBitComponents > ADDR_MAX_LEGACY_EQUATION_COMP)) + { + pOut->equationIndex = ADDR_INVALID_EQUATION_INDEX; + } } if (localIn.flags.qbStereo) @@ -1177,6 +1168,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear( ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {0}; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {0}; ADDR2_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); localIn.bpp = pIn->bpp; localIn.flags = pIn->flags; @@ -1852,6 +1844,61 @@ ADDR_E_RETURNCODE Lib::Addr2GetPreferredSurfaceSetting( return returnCode; } +/** +************************************************************************************************************************ +* Lib::GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the client to choose from +* +* @return +* ADDR_E_RETURNCODE 
+************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const +{ + return HwlGetPossibleSwizzleModes(pIn, pOut); +} + +/** +************************************************************************************************************************ +* Lib::GetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const +{ + return HwlGetAllowedBlockSet(allowedSwModeSet, rsrcType, pAllowedBlockSet); +} + +/** +************************************************************************************************************************ +* Lib::GetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const +{ + return HwlGetAllowedSwSet(allowedSwModeSet, pAllowedSwSet); +} + /** ************************************************************************************************************************ * Lib::ComputeBlock256Equation @@ -2000,7 +2047,8 @@ VOID Lib::ComputeQbStereoInfo( VOID Lib::FilterInvalidEqSwizzleMode( ADDR2_SWMODE_SET& allowedSwModeSet, AddrResourceType resourceType, - UINT_32 elemLog2 + UINT_32 elemLog2, + UINT_32 maxComponents ) const { if (resourceType != 
ADDR_RSRC_TEX_1D) @@ -2013,7 +2061,12 @@ VOID Lib::FilterInvalidEqSwizzleMode( { if (validSwModeSet & 1) { - if (m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2] == ADDR_INVALID_EQUATION_INDEX) + UINT_32 equation = m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2]; + if (equation == ADDR_INVALID_EQUATION_INDEX) + { + allowedSwModeSetVal &= ~(1u << swModeIdx); + } + else if (m_equationTable[equation].numBitComponents > maxComponents) { allowedSwModeSetVal &= ~(1u << swModeIdx); } @@ -2030,94 +2083,6 @@ VOID Lib::FilterInvalidEqSwizzleMode( } } -/** -************************************************************************************************************************ -* Lib::IsBlockTypeAvaiable -* -* @brief -* Determine whether a block type is allowed in a given blockSet -* -* @return -* N/A -************************************************************************************************************************ -*/ -BOOL_32 Lib::IsBlockTypeAvaiable( - ADDR2_BLOCK_SET blockSet, - AddrBlockType blockType) -{ - BOOL_32 avail; - - if (blockType == AddrBlockLinear) - { - avail = blockSet.linear ? TRUE : FALSE; - } - else - { - avail = blockSet.value & (1 << (static_cast(blockType) - 1)) ? 
TRUE : FALSE; - } - - return avail; -} - -/** -************************************************************************************************************************ -* Lib::BlockTypeWithinMemoryBudget -* -* @brief -* Determine whether a new block type is acceptible based on memory waste ratio -* -* @return -* N/A -************************************************************************************************************************ -*/ -BOOL_32 Lib::BlockTypeWithinMemoryBudget( - UINT_64 minSize, - UINT_64 newBlockTypeSize, - UINT_32 ratioLow, - UINT_32 ratioHi, - DOUBLE memoryBudget, - BOOL_32 newBlockTypeBigger) -{ - BOOL_32 accept = FALSE; - - if (memoryBudget >= 1.0) - { - if (newBlockTypeBigger) - { - if ((static_cast(newBlockTypeSize) / minSize) <= memoryBudget) - { - accept = TRUE; - } - } - else - { - if ((static_cast(minSize) / newBlockTypeSize) > memoryBudget) - { - accept = TRUE; - } - } - } - else - { - if (newBlockTypeBigger) - { - if ((newBlockTypeSize * ratioHi) <= (minSize * ratioLow)) - { - accept = TRUE; - } - } - else - { - if ((newBlockTypeSize * ratioLow) < (minSize * ratioHi)) - { - accept = TRUE; - } - } - } - - return accept; -} - #if DEBUG /** ************************************************************************************************************************ @@ -2195,4 +2160,5 @@ VOID Lib::ValidateStereoInfo( } // V2 } // Addr -} // rocr +} // namespace rocr + diff --git a/src/image/addrlib/src/core/addrlib2.h b/src/image/addrlib/src/core/addrlib2.h index 118306674..5abf58f03 100644 --- a/src/image/addrlib/src/core/addrlib2.h +++ b/src/image/addrlib/src/core/addrlib2.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "addrlib.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -147,6 +132,8 @@ union ADDR_BIT_SETTING * @brief Swizzle pattern information ************************************************************************************************************************ */ +// Accessed by index representing the logbase2 of (8bpp/16bpp/32bpp/64bpp/128bpp) +// contains the indices which map to 2D arrays SW_PATTERN_NIBBLE[0-9] which contain sections of an index equation. 
They are dependant on pipe# and bpe # struct ADDR_SW_PATINFO { UINT_8 maxItemCount; @@ -305,6 +292,10 @@ class Lib : public Addr::Lib const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + ADDR_E_RETURNCODE GetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + virtual BOOL_32 IsValidDisplaySwizzleMode( const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const { @@ -312,11 +303,21 @@ class Lib : public Addr::Lib return ADDR_NOTIMPLEMENTED; } + ADDR_E_RETURNCODE GetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const; + + ADDR_E_RETURNCODE GetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const; + protected: Lib(); // Constructor is protected Lib(const Client* pClient); static const UINT_32 MaxNumOfBpp = 5; + static const UINT_32 MaxNumOfBppCMask = 4; static const UINT_32 MaxNumOfAA = 4; static const Dim2d Block256_2d[MaxNumOfBpp]; @@ -669,6 +670,31 @@ class Lib : public Addr::Lib return ADDR_NOTSUPPORTED; } + virtual ADDR_E_RETURNCODE HwlGetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + virtual ADDR_E_RETURNCODE HwlGetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTIMPLEMENTED; + } + + virtual ADDR_E_RETURNCODE HwlGetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTIMPLEMENTED; + } + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfoSanityCheck( const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const { @@ -922,17 +948,8 @@ class Lib : public Addr::Lib VOID 
FilterInvalidEqSwizzleMode( ADDR2_SWMODE_SET& allowedSwModeSet, AddrResourceType resourceType, - UINT_32 elemLog2) const; - - static BOOL_32 IsBlockTypeAvaiable(ADDR2_BLOCK_SET blockSet, AddrBlockType blockType); - - static BOOL_32 BlockTypeWithinMemoryBudget( - UINT_64 minSize, - UINT_64 newBlockTypeSize, - UINT_32 ratioLow, - UINT_32 ratioHi, - DOUBLE memoryBudget = 0.0f, - BOOL_32 newBlockTypeBigger = TRUE); + UINT_32 elemLog2, + UINT_32 maxComponents) const; #if DEBUG VOID ValidateStereoInfo( @@ -982,7 +999,6 @@ class Lib : public Addr::Lib } // V2 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/core/addrlib3.cpp b/src/image/addrlib/src/core/addrlib3.cpp new file mode 100644 index 000000000..a2e6c3605 --- /dev/null +++ b/src/image/addrlib/src/core/addrlib3.cpp @@ -0,0 +1,1073 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + + +/** +************************************************************************************************************************ +* @file addrlib3.cpp +* @brief Contains the implementation for the AddrLib3 base class. 
+************************************************************************************************************************ +*/ + +#include "addrinterface.h" +#include "addrlib3.h" +#include "addrcommon.h" + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Static Const Member +//////////////////////////////////////////////////////////////////////////////////////////////////// + +const Dim2d Lib::Block256_2d[] = {{16, 16}, {16, 8}, {8, 8}, {8, 4}, {4, 4}}; + +const ADDR_EXTENT3D Lib::Block1K_3d[] = {{16, 8, 8}, {8, 8, 8}, {8, 8, 4}, {8, 4, 4}, {4, 4, 4}}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Constructor/Destructor +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/** +************************************************************************************************************************ +* Lib::Lib +* +* @brief +* Constructor for the Addr::V3::Lib class +* +************************************************************************************************************************ +*/ +Lib::Lib() + : + Addr::Lib(), + m_pipesLog2(0), + m_pipeInterleaveLog2(0), + m_numEquations(0) +{ + Init(); +} + +/** +************************************************************************************************************************ +* Lib::Lib +* +* @brief +* Constructor for the AddrLib3 class with hClient as parameter +* +************************************************************************************************************************ +*/ +Lib::Lib( + const Client* pClient) + : + Addr::Lib(pClient), + m_pipesLog2(0), + m_pipeInterleaveLog2(0), + m_numEquations(0) +{ + Init(); +} + +/** +************************************************************************************************************************ +* Lib::Init +* +* @brief +* 
Initialization of class +* +************************************************************************************************************************ +*/ +void Lib::Init() +{ + memset(m_equationTable, 0, sizeof(m_equationTable)); + + // There is no equation table entry for linear, so start at the "next" swizzle mode entry. + for (UINT_32 swizzleModeIdx = ADDR3_LINEAR + 1; swizzleModeIdx < ADDR3_MAX_TYPE; swizzleModeIdx++) + { + for (UINT_32 msaaRateIdx = 0; msaaRateIdx < MaxMsaaRateLog2; msaaRateIdx++) + { + for (UINT_32 log2BytesIdx = 0; log2BytesIdx < MaxElementBytesLog2; log2BytesIdx++) + { + SetEquationTableEntry(static_cast(swizzleModeIdx), + msaaRateIdx, + log2BytesIdx, + ADDR_INVALID_EQUATION_INDEX); + } + } + } +} + +/** +************************************************************************************************************************ +* Lib::~Lib +* +* @brief +* Destructor for the AddrLib2 class +* +************************************************************************************************************************ +*/ +Lib::~Lib() +{ +} + +/** +************************************************************************************************************************ +* Lib::GetLib +* +* @brief +* Get Addr::V3::Lib pointer +* +* @return +* An Addr::V2::Lib class pointer +************************************************************************************************************************ +*/ +Lib* Lib::GetLib( + ADDR_HANDLE hLib) ///< [in] handle of ADDR_HANDLE +{ + Addr::Lib* pAddrLib = Addr::Lib::GetLib(hLib); + + return static_cast(hLib); +} + +/** +************************************************************************************************************************ +* Lib::GetBlockSize +* +* @brief +* Returns the byte size of a block for the swizzle mode. +* +* @return +* Byte size of the block, zero if swizzle mode is invalid. 
+************************************************************************************************************************ +*/ +UINT_32 Lib::GetBlockSize( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch + ) const +{ + return (1 << GetBlockSizeLog2(swizzleMode, forPitch)); +} + +/** +************************************************************************************************************************ +* Lib::GetBlockSizeLog2 +* +* @brief +* Returns the log2 of the byte size of a block for the swizzle mode. +* +* @return +* Byte size of the block, zero if swizzle mode is invalid. +************************************************************************************************************************ +*/ +UINT_32 Lib::GetBlockSizeLog2( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch + ) const +{ + UINT_32 blockSize = 0; + + switch (swizzleMode) + { + case ADDR3_256B_2D: + blockSize = 8; + break; + case ADDR3_4KB_2D: + case ADDR3_4KB_3D: + blockSize = 12; + break; + case ADDR3_64KB_2D: + case ADDR3_64KB_3D: + blockSize = 16; + break; + case ADDR3_256KB_2D: + case ADDR3_256KB_3D: + blockSize = 18; + break; + case ADDR3_LINEAR: + blockSize = (forPitch ? 7 : 8); + break; + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + return blockSize; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceInfo +* +* @brief +* Interface function stub of ComputeSurfaceInfo. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((pIn->size != sizeof(ADDR3_COMPUTE_SURFACE_INFO_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + // Adjust incoming parameters. + ADDR3_COMPUTE_SURFACE_INFO_INPUT localIn = *pIn; + localIn.width = Max(pIn->width, 1u); + localIn.height = Max(pIn->height, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSlices = Max(pIn->numSlices, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + UINT_32 expandX = 1; + UINT_32 expandY = 1; + ElemMode elemMode = ADDR_UNCOMPRESSED; + + if (returnCode == ADDR_OK) + { + // Set format to INVALID will skip this conversion + if (localIn.format != ADDR_FMT_INVALID) + { + // Get compression/expansion factors and element mode which indicates compression/expansion + localIn.bpp = GetElemLib()->GetBitsPerPixel(localIn.format, + &elemMode, + &expandX, + &expandY); + + // Special flag for 96 bit surface. 96 (or 48 if we support) bit surface's width is + // pre-multiplied by 3 and bpp is divided by 3. So pitch alignment for linear- + // aligned does not meet 64-pixel in real. We keep special handling in hwl since hw + // restrictions are different. 
+ // Also Mip 1+ needs an element pitch of 32 bits so we do not need this workaround + // but we use this flag to skip RestoreSurfaceInfo below + if ((elemMode == ADDR_EXPANDED) && (expandX > 1)) + { + ADDR_ASSERT(IsLinear(localIn.swizzleMode)); + } + + UINT_32 basePitch = 0; + GetElemLib()->AdjustSurfaceInfo(elemMode, + expandX, + expandY, + &localIn.bpp, + &basePitch, + &localIn.width, + &localIn.height); + + // Overwrite these parameters if we have a valid format + } + + if (localIn.bpp != 0) + { + localIn.width = Max(localIn.width, 1u); + localIn.height = Max(localIn.height, 1u); + } + else // Rule out some invalid parameters + { + returnCode = ADDR_INVALIDPARAMS; + } + } + + if (returnCode == ADDR_OK) + { + returnCode = HwlComputeSurfaceInfo(&localIn, pOut); + + if (returnCode == ADDR_OK) + { + pOut->bpp = localIn.bpp; + pOut->pixelPitch = pOut->pitch; + pOut->pixelHeight = pOut->height; + + if (localIn.format != ADDR_FMT_INVALID) + { + UINT_32 pixelBits = pOut->pixelBits; + + GetElemLib()->RestoreSurfaceInfo(elemMode, + expandX, + expandY, + &pOut->pixelBits, + &pOut->pixelPitch, + &pOut->pixelHeight); + + GetElemLib()->RestoreSurfaceInfo(elemMode, + expandX, + expandY, + &pixelBits, + &pOut->pixelMipChainPitch, + &pOut->pixelMipChainHeight); + + if ((localIn.numMipLevels > 1) && (pOut->pMipInfo != NULL)) + { + for (UINT_32 i = 0; i < localIn.numMipLevels; i++) + { + pOut->pMipInfo[i].pixelPitch = pOut->pMipInfo[i].pitch; + pOut->pMipInfo[i].pixelHeight = pOut->pMipInfo[i].height; + + GetElemLib()->RestoreSurfaceInfo(elemMode, + expandX, + expandY, + &pixelBits, + &pOut->pMipInfo[i].pixelPitch, + &pOut->pMipInfo[i].pixelHeight); + } + } + } + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::GetPossibleSwizzleModes +* +* @brief +* Interface function stub of AddrComputeSurfaceInfo. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetPossibleSwizzleModes( + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, ///< [in] input structure + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((pIn->size != sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT)) || + (pOut->size != sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + if (returnCode == ADDR_OK) + { + const ADDR3_SURFACE_FLAGS flags = pIn->flags; + + // VRS images can only be 2D from the client API rules. + ADDR_ASSERT((flags.isVrsImage == 0) || IsTex2d(pIn->resourceType)); + + if (pIn->bpp == 96) + { + pOut->validModes.swLinear = 1; + } + // Depth/Stencil images can't be linear and must be 2D swizzle modes. + // These three are related to DB block that supports only SW_64KB_2D and SW_256KB_2D for DSV. + else if (flags.depth || flags.stencil) + { + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + // The organization of elements in the hierarchical surface is the same as any other surface, and it can support + // any 2D swizzle mode (SW_256_2D, SW_4KB_2D, SW_64KB_2D, or SW_256KB_2D). The swizzle mode can be selected + // orthogonally to the underlying z or stencil surface. + else if (pIn->flags.hiZHiS) + { + pOut->validModes.sw2d256B = 1; + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + // MSAA can't be linear and must be 2D swizzle modes. + else if (pIn->numSamples > 1) + { + // NOTE: SW_256B_2D still supports MSAA. The removal of 256B for MSAA is reverted in HW Doc. 
+ pOut->validModes.sw2d256B = 1; + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + // Block-compressed images need to be either using 2D or linear swizzle modes. + else if (flags.blockCompressed) + { + pOut->validModes.swLinear = 1; + + // We find cases where Tex3d BlockCompressed image adopts 2D_256B should be prohibited. + if (IsTex3d(pIn->resourceType) == FALSE) + { + pOut->validModes.sw2d256B = 1; + } + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + else if (IsTex1d(pIn->resourceType)) + { + pOut->validModes.swLinear = 1; + pOut->validModes.sw2d256B = 1; + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + else if (flags.nv12 || flags.p010 || IsTex2d(pIn->resourceType) || flags.view3dAs2dArray) + { + // NV12 and P010 support + // SW_LINEAR, SW_256B_2D, SW_4KB_2D, SW_64KB_2D, SW_256KB_2D + // There could be more multimedia formats that require more hw specific tiling modes... + + // The exception is VRS images. + // Linear is not allowed and the VRS surface needs to be 8BPP format. + if (flags.isVrsImage) + { + ADDR_ASSERT(pIn->bpp == 8); + } + else + { + pOut->validModes.swLinear = 1; + } + if (flags.view3dAs2dArray == 0) + { + // ADDR3_256B_2D can't support 3D images. + pOut->validModes.sw2d256B = 1; + } + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + else if (IsTex3d(pIn->resourceType)) + { + // An eventual determination would be based on pal setting of height_watermark and depth_watermark. + // However, we just adopt the simpler logic currently. + // For 3D images w/ view3dAs2dArray = 0, SW_3D is preferred. + // For 3D images w/ view3dAs2dArray = 1, it should go to 2D path above. + // Enable linear since client may force linear tiling for 3D texture that does not set view3dAs2dArray. 
+ pOut->validModes.swLinear = 1; + pOut->validModes.sw3d4kB = 1; + pOut->validModes.sw3d64kB = 1; + pOut->validModes.sw3d256kB = 1; + } + } + + constexpr UINT_32 Size256 = 256u; + constexpr UINT_32 Size4K = 4 * 1024; + constexpr UINT_32 Size64K = 64 * 1024; + constexpr UINT_32 Size256K = 256 * 1024; + + ADDR_ASSERT(pIn->maxAlign != 0); + + if (pIn->maxAlign < Size256K) + { + pOut->validModes.value &= ~Gfx12Blk256KBSwModeMask; + } + + if (pIn->maxAlign < Size64K) + { + pOut->validModes.value &= ~Gfx12Blk64KBSwModeMask; + } + + if (pIn->maxAlign < Size4K) + { + pOut->validModes.value &= ~Gfx12Blk4KBSwModeMask; + } + + if (pIn->maxAlign < Size256) + { + pOut->validModes.value &= ~Gfx12Blk256BSwModeMask; + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::HwlConvertChipFamily +* +* @brief +* Convert familyID defined in atiid.h to ChipFamily and set m_chipFamily/m_chipRevision +* @return +* ChipFamily +************************************************************************************************************************ +*/ +ChipFamily Lib::HwlConvertChipFamily( + UINT_32 chipFamily, ///< [in] chip family defined in atiih.h + UINT_32 chipRevision) ///< [in] chip revision defined in "asic_family"_id.h +{ + return ADDR_CHIP_FAMILY_NAVI; +} + +/** +************************************************************************************************************************ +* Lib::ComputeBlockDimensionForSurf +* +* @brief +* Internal function to get block width/height/depth in element from surface input params. 
+* +* @return +* VOID +************************************************************************************************************************ +*/ +VOID Lib::ComputeBlockDimensionForSurf( + ADDR_EXTENT3D* pExtent, + UINT_32 bpp, + UINT_32 numSamples, + Addr3SwizzleMode swizzleMode + ) const +{ + const UINT_32 eleBytes = bpp >> 3; + const UINT_32 log2EleBytes = Log2(eleBytes); + const UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); + + if (IsLinear(swizzleMode)) + { + pExtent->width = 1 << (log2BlkSize - log2EleBytes); + pExtent->height = 1; + pExtent->depth = 1; + } + else if (Is3dSwizzle(swizzleMode)) + { + const UINT_32 base = (log2BlkSize / 3) - (log2EleBytes / 3); + const UINT_32 log2BlkSizeMod3 = log2BlkSize % 3; + const UINT_32 log2EleBytesMod3 = log2EleBytes % 3; + + UINT_32 x = base; + UINT_32 y = base; + UINT_32 z = base; + + if (log2BlkSizeMod3 > 0) + { + x++; + } + + if (log2BlkSizeMod3 > 1) + { + z++; + } + + if (log2EleBytesMod3 > 0) + { + x--; + } + + if (log2EleBytesMod3 > 1) + { + z--; + } + + pExtent->width = 1u << x; + pExtent->height = 1u << y; + pExtent->depth = 1u << z; + } + else + { + const UINT_32 log2Samples = Log2(Max(numSamples, 1u)); + const UINT_32 log2Width = (log2BlkSize >> 1) - + (log2EleBytes >> 1) - + (log2Samples >> 1) - + (log2EleBytes & log2Samples & 1); + const UINT_32 log2Height = (log2BlkSize >> 1) - + (log2EleBytes >> 1) - + (log2Samples >> 1) - + ((log2EleBytes | log2Samples) & 1); + + // Return the extent in actual units, not log2 + pExtent->width = 1u << log2Width; + pExtent->height = 1u << log2Height; + pExtent->depth = 1; + } +} + +/** +************************************************************************************************************************ +* Lib::GetMipTailDim +* +* @brief +* Internal function to get out max dimension of first level in mip tail +* +* @return +* Max Width/Height/Depth value of the first mip fitted in mip tail 
+************************************************************************************************************************ +*/ +ADDR_EXTENT3D Lib::GetMipTailDim( + Addr3SwizzleMode swizzleMode, + const ADDR_EXTENT3D& blockDims + ) const +{ + const UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); + + ADDR_EXTENT3D out = blockDims; + + if (Is3dSwizzle(swizzleMode)) + { + const UINT_32 dim = log2BlkSize % 3; + + if (dim == 0) + { + out.height >>= 1; + } + else if (dim == 1) + { + out.width >>= 1; + } + else + { + out.depth >>= 1; + } + } + else + { + if ((log2BlkSize % 2) == 0) + { + out.width >>= 1; + } + else + { + out.height >>= 1; + } + } + + return out; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoord +* +* @brief +* Interface function stub of ComputeSurfaceAddrFromCoord. +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoord( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((pIn->size != sizeof(ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT localIn = *pIn; + localIn.unAlignedDims.width = Max(pIn->unAlignedDims.width, 1u); + localIn.unAlignedDims.height = Max(pIn->unAlignedDims.height, 1u); + localIn.unAlignedDims.depth = Max(pIn->unAlignedDims.depth, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + if ((localIn.bpp < 8) || + (localIn.bpp > 128) || 
+ ((localIn.bpp % 8) != 0) || + (localIn.sample >= localIn.numSamples) || + (localIn.slice >= localIn.unAlignedDims.depth) || + (localIn.mipId >= localIn.numMipLevels) || + (IsTex3d(localIn.resourceType) && + (Valid3DMipSliceIdConstraint(localIn.unAlignedDims.depth, localIn.mipId, localIn.slice) == FALSE))) + { + returnCode = ADDR_INVALIDPARAMS; + } + + if (returnCode == ADDR_OK) + { + if (IsLinear(localIn.swizzleMode)) + { + returnCode = ComputeSurfaceAddrFromCoordLinear(&localIn, pOut); + } + else + { + returnCode = ComputeSurfaceAddrFromCoordTiled(&localIn, pOut); + } + + if (returnCode == ADDR_OK) + { + pOut->prtBlockIndex = static_cast(pOut->addr / (64 * 1024)); + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoord +* +* @brief +* Interface function stub of Addr3ComputePipeBankXor. +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + returnCode = HwlComputePipeBankXor(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoordLinear +* +* @brief +* Internal function to calculate address from coord for linear swizzle surface +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ 
+ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + BOOL_32 valid = (pIn->numSamples <= 1); + + if (valid) + { + if (IsTex1d(pIn->resourceType)) + { + valid = (pIn->y == 0); + } + } + + if (valid) + { + ADDR3_COMPUTE_SURFACE_INFO_INPUT localIn = {0}; + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT localOut = {0}; + ADDR3_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); + + localIn.size = sizeof(localIn); + localIn.flags = pIn->flags; + localIn.swizzleMode = ADDR3_LINEAR; + localIn.resourceType = pIn->resourceType; + localIn.format = ADDR_FMT_INVALID; + localIn.bpp = pIn->bpp; + localIn.width = Max(pIn->unAlignedDims.width, 1u); + localIn.height = Max(pIn->unAlignedDims.height, 1u); + localIn.numSlices = Max(pIn->unAlignedDims.depth, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + if (localIn.numMipLevels <= 1) + { + localIn.pitchInElement = pIn->pitchInElement; + } + + localOut.size = sizeof(localOut); + localOut.pMipInfo = mipInfo; + + returnCode = ComputeSurfaceInfo(&localIn, &localOut); + + if (returnCode == ADDR_OK) + { + pOut->addr = (localOut.sliceSize * pIn->slice) + + mipInfo[pIn->mipId].offset + + (pIn->y * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3); + pOut->bitPosition = 0; + } + else + { + valid = FALSE; + } + } + + if (valid == FALSE) + { + returnCode = ADDR_INVALIDPARAMS; + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoordTiled +* +* @brief +* Internal function to calculate address from coord for tiled swizzle surface +* +* @return +* ADDR_E_RETURNCODE 
+************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + return HwlComputeSurfaceAddrFromCoordTiled(pIn, pOut); +} + +/** +************************************************************************************************************************ +* Lib::ComputeNonBlockCompressedView +* +* @brief +* Interface function stub of Addr3ComputeNonBlockCompressedView. +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + else if (Is3dSwizzle(pIn->swizzleMode)) + { + // 3D volume images using ADDR3_XX_3D is currently not supported. + returnCode = ADDR_NOTSUPPORTED; + } + else + { + returnCode = HwlComputeNonBlockCompressedView(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Interface function stub of Addr3ComputeSubResourceOffsetForSwizzlePattern. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR2_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT)) || + (pOut->size != sizeof(ADDR2_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + HwlComputeSubResourceOffsetForSwizzlePattern(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSlicePipeBankXor +* +* @brief +* Interface function stub of Addr3ComputeSlicePipeBankXor. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSlicePipeBankXor( + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + if ((pIn->bpe != 0) && + (pIn->bpe != 8) && + (pIn->bpe != 16) && + (pIn->bpe != 32) && + (pIn->bpe != 64) && + (pIn->bpe != 128)) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + returnCode = HwlComputeSlicePipeBankXor(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::UseCustomHeight +* +* @brief +* Determines if the calculations for this surface should use minimal HW values or user-specified values. +* +* @return +* Returns TRUE if the user-specified alignment should be used +************************************************************************************************************************ +*/ +BOOL_32 Lib::UseCustomHeight( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return ((pIn->numMipLevels <= 1) && + IsLinear(pIn->swizzleMode) && + (pIn->sliceAlign > 0)); +} + +/** +************************************************************************************************************************ +* Lib::UseCustomPitch +* +* @brief +* Determines if the calculations for this surface should use minimal HW values or user-specified values. 
+* +* @return +* Returns TRUE if the user-specified pitch should be used +************************************************************************************************************************ +*/ +BOOL_32 Lib::UseCustomPitch( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return ((pIn->numMipLevels <= 1) && + IsLinear(pIn->swizzleMode) && + (pIn->pitchInElement > 0)); +} + +/** +************************************************************************************************************************ +* Lib::CanTrimLinearPadding +* +* @brief +* Determines if the calculations for this surface can omit extra trailing padding for linear surfaces. +* +* @return +* Returns TRUE if the trailing padding can be omitted. +************************************************************************************************************************ +*/ +BOOL_32 Lib::CanTrimLinearPadding( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return ((IsTex3d(pIn->resourceType) == FALSE) && + (pIn->numSlices <= 1) && + IsLinear(pIn->swizzleMode)); +} + +/** +************************************************************************************************************************ +* Lib::ApplyCustomizedPitchHeight +* +* @brief +* Helper function to override hw required row pitch/slice pitch by customrized one +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ApplyCustomizedPitchHeight( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + const UINT_32 elementBytes = pIn->bpp >> 3; + + // Calculate the default pitch/height without any user inputs + pOut->pitch = PowTwoAlign(pIn->width, pOut->blockExtent.width); + pOut->height = PowTwoAlign(pIn->height, pOut->blockExtent.height); + + // Custom pitches / alignments 
are only possible with single mip level / linear images; otherwise, + // ignore those parameters. + if (UseCustomPitch(pIn)) + { + const UINT_32 pitchAlignmentBytes = 1 << GetBlockSizeLog2(pIn->swizzleMode, TRUE); + const UINT_32 pitchAlignmentElements = pitchAlignmentBytes / elementBytes; + + // Their requested pitch has to meet the pitch alignment constraints applied by the HW. + if ((pIn->pitchInElement % pitchAlignmentElements) != 0) + { + returnCode = ADDR_INVALIDPARAMS; + } + // And their pitch can't be less than the minimum + else if (pIn->pitchInElement < pOut->pitch) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + pOut->pitch = pIn->pitchInElement; + } + } + + if ((returnCode == ADDR_OK) && UseCustomHeight(pIn)) + { + UINT_32 customizedHeight = pIn->sliceAlign / elementBytes / pOut->pitch; + + if (customizedHeight * elementBytes * pOut->pitch != pIn->sliceAlign) + { + returnCode = ADDR_INVALIDPARAMS; + } + else if ((pIn->numSlices > 1) && (pOut->height != customizedHeight)) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + pOut->height = customizedHeight; + } + } + + return returnCode; +} + +} // V3 +} // Addr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/core/addrlib3.h b/src/image/addrlib/src/core/addrlib3.h new file mode 100644 index 000000000..6b7a1818b --- /dev/null +++ b/src/image/addrlib/src/core/addrlib3.h @@ -0,0 +1,417 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + + +/** +************************************************************************************************************************ +* @file addrlib3.h +* @brief Contains the Addr::V3::Lib class definition. 
+************************************************************************************************************************ +*/ + +#ifndef __ADDR3_LIB3_H__ +#define __ADDR3_LIB3_H__ + +#include "addrlib.h" + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + +/** +************************************************************************************************************************ +* @brief Bitmasks for swizzle mode determination on GFX12 +************************************************************************************************************************ +*/ +const UINT_32 Gfx12Blk256KBSwModeMask = (1u << ADDR3_256KB_2D) | + (1u << ADDR3_256KB_3D); + +const UINT_32 Gfx12Blk64KBSwModeMask = (1u << ADDR3_64KB_2D) | + (1u << ADDR3_64KB_3D); + +const UINT_32 Gfx12Blk4KBSwModeMask = (1u << ADDR3_4KB_2D) | + (1u << ADDR3_4KB_3D); + +const UINT_32 Gfx12Blk256BSwModeMask = (1u << ADDR3_256B_2D); + +/** +************************************************************************************************************************ +* @brief Bit setting for swizzle pattern +************************************************************************************************************************ +*/ +union ADDR_BIT_SETTING +{ + struct + { + UINT_16 x; + UINT_16 y; + UINT_16 z; + UINT_16 s; + }; + UINT_64 value; +}; + +/** +************************************************************************************************************************ +* @brief Flags for SwizzleModeTable +************************************************************************************************************************ +*/ +union SwizzleModeFlags +{ + struct + { + // Swizzle mode + UINT_32 isLinear : 1; // Linear + UINT_32 is2d : 1; // 2d mode + UINT_32 is3d : 1; // 3d mode + + // Block size + UINT_32 is256b : 1; // Block size is 256B + UINT_32 is4kb : 1; // Block size is 4KB + UINT_32 is64kb : 1; // Block size is 64KB + UINT_32 is256kb : 1; // Block size is 256KB + + UINT_32 reserved : 25; // 
Reserved bits + }; + + UINT_32 u32All; +}; + +struct Dim2d +{ + UINT_32 w; + UINT_32 h; +}; + +const UINT_32 Log2Size256 = 8u; +const UINT_32 Log2Size4K = 12u; +const UINT_32 Log2Size64K = 16u; +const UINT_32 Log2Size256K = 18u; + +/** +************************************************************************************************************************ +* @brief Swizzle pattern information +************************************************************************************************************************ +*/ +// Accessed by index representing the logbase2 of (8bpp/16bpp/32bpp/64bpp/128bpp) +// contains the indices which map to 2D arrays SW_PATTERN_NIBBLE[1-4] which contain sections of an index equation. +struct ADDR_SW_PATINFO +{ + UINT_8 nibble1Idx; + UINT_8 nibble2Idx; + UINT_8 nibble3Idx; + UINT_8 nibble4Idx; +}; + +/** +************************************************************************************************************************ +* InitBit +* +* @brief +* Initialize bit setting value via a return value +************************************************************************************************************************ +*/ +#define InitBit(c, index) (1ull << ((c << 4) + index)) + +const UINT_64 X0 = InitBit(0, 0); +const UINT_64 X1 = InitBit(0, 1); +const UINT_64 X2 = InitBit(0, 2); +const UINT_64 X3 = InitBit(0, 3); +const UINT_64 X4 = InitBit(0, 4); +const UINT_64 X5 = InitBit(0, 5); +const UINT_64 X6 = InitBit(0, 6); +const UINT_64 X7 = InitBit(0, 7); +const UINT_64 X8 = InitBit(0, 8); + +const UINT_64 Y0 = InitBit(1, 0); +const UINT_64 Y1 = InitBit(1, 1); +const UINT_64 Y2 = InitBit(1, 2); +const UINT_64 Y3 = InitBit(1, 3); +const UINT_64 Y4 = InitBit(1, 4); +const UINT_64 Y5 = InitBit(1, 5); +const UINT_64 Y6 = InitBit(1, 6); +const UINT_64 Y7 = InitBit(1, 7); +const UINT_64 Y8 = InitBit(1, 8); + +const UINT_64 Z0 = InitBit(2, 0); +const UINT_64 Z1 = InitBit(2, 1); +const UINT_64 Z2 = InitBit(2, 2); +const UINT_64 Z3 = InitBit(2, 3); 
+const UINT_64 Z4 = InitBit(2, 4); +const UINT_64 Z5 = InitBit(2, 5); + +const UINT_64 S0 = InitBit(3, 0); +const UINT_64 S1 = InitBit(3, 1); +const UINT_64 S2 = InitBit(3, 2); + +/** +************************************************************************************************************************ +* @brief Bit setting for swizzle pattern +************************************************************************************************************************ +*/ + +/** +************************************************************************************************************************ +* @brief This class contains asic independent address lib functionalities +************************************************************************************************************************ +*/ +class Lib : public Addr::Lib +{ +public: + virtual ~Lib(); + + static Lib* GetLib( + ADDR_HANDLE hLib); + + // + // Interface stubs + // + + // For data surface + ADDR_E_RETURNCODE ComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + ADDR_E_RETURNCODE GetPossibleSwizzleModes( + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut) const; + + ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoord( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + // Misc + ADDR_E_RETURNCODE ComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut); + + ADDR_E_RETURNCODE ComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut); + + ADDR_E_RETURNCODE ComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut); + + ADDR_E_RETURNCODE ComputeSlicePipeBankXor( + const 
ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut); + +protected: + Lib(); // Constructor is protected + Lib(const Client* pClient); + + static const UINT_32 MaxImageDim = 65536; + static const UINT_32 MaxMipLevels = 17; // Max image size is 64k + static const UINT_32 MaxNumOfBpp = 5; + static const UINT_32 MaxNumOfAA = 4; + UINT_32 m_pipesLog2; ///< Number of pipe per shader engine Log2 + UINT_32 m_pipeInterleaveLog2; ///< Log2 of pipe interleave bytes + + static const Dim2d Block256_2d[MaxNumOfBpp]; + static const ADDR_EXTENT3D Block1K_3d[MaxNumOfBpp]; + SwizzleModeFlags m_swizzleModeTable[ADDR3_MAX_TYPE]; ///< Swizzle mode table + + // Number of unique MSAA sample rates (1/2/4/8) + static const UINT_32 MaxMsaaRateLog2 = 4; + // Max number of bpp (8bpp/16bpp/32bpp/64bpp/128bpp) + static const UINT_32 MaxElementBytesLog2 = 5; + // Number of unique swizzle patterns (one entry per swizzle mode + MSAA + bpp configuration) + static const UINT_32 NumSwizzlePatterns = 19 * MaxElementBytesLog2; + + // Number of equation entries in the table + UINT_32 m_numEquations; + // Equation lookup table according to swizzle mode, MSAA sample rate, and bpp + UINT_32 m_equationLookupTable[ADDR3_MAX_TYPE - 1][MaxMsaaRateLog2][MaxElementBytesLog2]; + + // Equation table + ADDR_EQUATION m_equationTable[NumSwizzlePatterns]; + + void SetEquationTableEntry( + Addr3SwizzleMode addrType, + UINT_32 msaaLog2, + UINT_32 elementLog2, + UINT_32 value) + { + m_equationLookupTable[addrType - 1][msaaLog2][elementLog2] = value; + } + + const UINT_32 GetEquationTableEntry( + Addr3SwizzleMode addrType, + UINT_32 msaaLog2, + UINT_32 elementLog2) const + { + return m_equationLookupTable[addrType - 1][msaaLog2][elementLog2]; + } + + static BOOL_32 Valid3DMipSliceIdConstraint( + UINT_32 numSlices, + UINT_32 mipId, + UINT_32 slice) + { + return (Max((numSlices >> mipId), 1u) > slice); + } + + UINT_32 GetBlockSize( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch = 
FALSE) const; + + UINT_32 GetBlockSizeLog2( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch = FALSE) const; + + BOOL_32 IsValidSwMode(Addr3SwizzleMode swizzleMode) const + { + return (m_swizzleModeTable[swizzleMode].u32All != 0); + } + + UINT_32 IsLinear(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].isLinear; + } + + // Checking block size + BOOL_32 IsBlock256b(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is256b; + } + + // Checking block size + BOOL_32 IsBlock4kb(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is4kb; + } + + // Checking block size + BOOL_32 IsBlock64kb(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is64kb; + } + + // Checking block size + BOOL_32 IsBlock256kb(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is256kb; + } + + BOOL_32 Is2dSwizzle(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is2d; + } + + BOOL_32 Is3dSwizzle(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is3d; + } + + virtual UINT_32 HwlComputeMaxBaseAlignments() const { return 256 * 1024; } + + virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) + { + ADDR_NOT_IMPLEMENTED(); + // Although GFX12 addressing should be consistent regardless of the configuration, we still need to + // call some initialization for member variables. 
+ return TRUE; + } + + virtual ChipFamily HwlConvertChipFamily( + UINT_32 chipFamily, + UINT_32 chipRevision); + + virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const { return 0; } + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + virtual ADDR_E_RETURNCODE HwlComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + VOID ComputeBlockDimensionForSurf( + ADDR_EXTENT3D* pExtent, + UINT_32 bpp, + UINT_32 numSamples, + Addr3SwizzleMode swizzleMode) const; + + ADDR_EXTENT3D GetMipTailDim( + Addr3SwizzleMode swizzleMode, + const ADDR_EXTENT3D& blockDims) const; + + ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoordLinear( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTIMPLEMENTED; + } + + virtual ADDR_E_RETURNCODE HwlComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + virtual VOID HwlComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + } + + virtual ADDR_E_RETURNCODE HwlComputeSlicePipeBankXor( + const 
ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + ADDR_E_RETURNCODE ApplyCustomizedPitchHeight( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + BOOL_32 UseCustomHeight(const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + BOOL_32 UseCustomPitch(const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + BOOL_32 CanTrimLinearPadding(const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + +private: + // Disallow the copy constructor + Lib(const Lib& a); + + // Disallow the assignment operator + Lib& operator=(const Lib& a); + + void Init(); +}; + +} // V3 +} // Addr +} // namespace rocr + +#endif \ No newline at end of file diff --git a/src/image/addrlib/src/core/addrobject.cpp b/src/image/addrlib/src/core/addrobject.cpp index 2a08b0ae0..f3d3fff27 100644 --- a/src/image/addrlib/src/core/addrobject.cpp +++ b/src/image/addrlib/src/core/addrobject.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -35,7 +18,8 @@ #include "addrobject.h" namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -237,4 +221,4 @@ VOID Object::DebugPrint( } } // Addr -} // rocr +} // namespace rocr diff --git a/src/image/addrlib/src/core/addrobject.h b/src/image/addrlib/src/core/addrobject.h index 0d270789a..57205e5b2 100644 --- a/src/image/addrlib/src/core/addrobject.h +++ b/src/image/addrlib/src/core/addrobject.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,7 +21,8 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -91,6 +75,5 @@ class Object }; } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/core/coord.cpp b/src/image/addrlib/src/core/coord.cpp index f371458f4..74644e75e 100644 --- a/src/image/addrlib/src/core/coord.cpp +++ b/src/image/addrlib/src/core/coord.cpp @@ -3,24 +3,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -29,8 +12,10 @@ #include "coord.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ Coordinate::Coordinate() { @@ -600,4 +585,4 @@ BOOL_32 CoordEq::operator!=(const CoordEq& b) } // V2 } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/core/coord.h b/src/image/addrlib/src/core/coord.h index 490823f3f..95c4fca68 100644 --- a/src/image/addrlib/src/core/coord.h +++ b/src/image/addrlib/src/core/coord.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -29,8 +12,10 @@ #define __COORD_H namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ #if defined(__cplusplus) #if defined(_MSC_VER) #if _MSC_VER >= 1900 @@ -140,7 +125,6 @@ class CoordEq } // V2 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h b/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h index 3a783bb4b..f476b3984 100644 --- a/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h +++ b/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,9 +17,10 @@ #define __GFX10_SWIZZLE_PATTERN_H__ namespace rocr { -namespace Addr { -namespace V2 { - +namespace Addr +{ +namespace V2 +{ const ADDR_SW_PATINFO GFX10_SW_256_S_PATINFO[] = { { 1, 0, 0, 0, 0, } , // 1 pipes 1 bpe @ SW_256_S @ Navi1x @@ -6031,7 +6015,8 @@ const UINT_64 GFX10_CMASK_SW_PATTERN[][17] = {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y6^X7, Z0^X6^Y7, 0, 0, }, //34 }; -} // V2 +}// V2 } // Addr -} // rocr +} // namespace rocr + #endif diff --git a/src/image/addrlib/src/gfx10/gfx10addrlib.cpp b/src/image/addrlib/src/gfx10/gfx10addrlib.cpp index 733252f88..324697b73 100644 --- a/src/image/addrlib/src/gfx10/gfx10addrlib.cpp +++ b/src/image/addrlib/src/gfx10/gfx10addrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -31,15 +14,16 @@ */ #include "gfx10addrlib.h" +#include "addrcommon.h" #include "gfx10_gb_reg.h" #include "amdgpu_asic_addr.h" //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - namespace rocr { -namespace Addr { +namespace Addr +{ /** ************************************************************************************************************************ * Gfx10HwlInit @@ -93,7 +77,7 @@ const SwizzleModeFlags Gfx10Lib::SwizzleModeTable[ADDR_SW_MAX_TYPE] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved {{0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // 
ADDR_SW_4KB_S_X {{0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR_SW_4KB_D_X - {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0}}, // ADDR_SW_4KB_R_X {{0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_Z_X {{0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_S_X @@ -128,6 +112,7 @@ Gfx10Lib::Gfx10Lib(const Client* pClient) m_numSaLog2(0), m_colorBaseIndex(0), m_xmaskBaseIndex(0), + m_htileBaseIndex(0), m_dccBaseIndex(0) { memset(&m_settings, 0, sizeof(m_settings)); @@ -675,7 +660,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeHtileAddrFromCoord( { const UINT_32 numSampleLog2 = Log2(pIn->numSamples); const UINT_32 pipeMask = (1 << m_pipesLog2) - 1; - const UINT_32 index = m_xmaskBaseIndex + numSampleLog2; + const UINT_32 index = m_htileBaseIndex + numSampleLog2; const UINT_8* patIdxTable = m_settings.supportRbPlus ? GFX10_HTILE_RBPLUS_PATIDX : GFX10_HTILE_PATIDX; const UINT_32 blkSizeLog2 = Log2(output.metaBlkWidth) + Log2(output.metaBlkHeight) - 4; @@ -948,9 +933,11 @@ BOOL_32 Gfx10Lib::HwlInitGlobalParams( { // Skip unaligned case - m_xmaskBaseIndex += MaxNumOfAA; + m_xmaskBaseIndex += MaxNumOfBppCMask; + m_htileBaseIndex += MaxNumOfAA; - m_xmaskBaseIndex += m_pipesLog2 * MaxNumOfAA; + m_xmaskBaseIndex += m_pipesLog2 * MaxNumOfBppCMask; + m_htileBaseIndex += m_pipesLog2 * MaxNumOfAA; m_colorBaseIndex += m_pipesLog2 * MaxNumOfBpp; if (m_settings.supportRbPlus) @@ -966,7 +953,8 @@ BOOL_32 Gfx10Lib::HwlInitGlobalParams( if (m_numPkrLog2 >= 2) { m_colorBaseIndex += (2 * m_numPkrLog2 - 2) * MaxNumOfBpp; - m_xmaskBaseIndex += (m_numPkrLog2 - 1) * 3 * MaxNumOfAA; + m_xmaskBaseIndex += (m_numPkrLog2 - 1) * 3 * MaxNumOfBppCMask; + m_htileBaseIndex += (m_numPkrLog2 - 1) * 3 * MaxNumOfAA; } } else @@ -976,9 +964,8 @@ BOOL_32 Gfx10Lib::HwlInitGlobalParams( 1; ADDR_C_ASSERT(sizeof(GFX10_HTILE_PATIDX) / sizeof(GFX10_HTILE_PATIDX[0]) == (numPipeType + 1) * MaxNumOfAA); - - ADDR_C_ASSERT(sizeof(GFX10_HTILE_PATIDX) / 
sizeof(GFX10_HTILE_PATIDX[0]) == - sizeof(GFX10_CMASK_64K_PATIDX) / sizeof(GFX10_CMASK_64K_PATIDX[0])); + ADDR_C_ASSERT(sizeof(GFX10_CMASK_64K_PATIDX) / sizeof(GFX10_CMASK_64K_PATIDX[0]) == + (numPipeType + 1) * MaxNumOfBppCMask); } } @@ -1071,7 +1058,6 @@ ChipFamily Gfx10Lib::HwlConvertChipFamily( ADDR_ASSERT(!"Unknown chip revision"); } break; - case FAMILY_RMB: if (ASICREV_IS_REMBRANDT(chipRevision)) { @@ -1083,15 +1069,15 @@ ChipFamily Gfx10Lib::HwlConvertChipFamily( ADDR_ASSERT(!"Unknown chip revision"); } break; - case FAMILY_GC_10_3_6: - if (ASICREV_IS_GFX1036(chipRevision)) + case FAMILY_RPL: + if (ASICREV_IS_RAPHAEL(chipRevision)) { m_settings.supportRbPlus = 1; m_settings.dccUnsup3DSwDis = 0; } break; - case FAMILY_GC_10_3_7: - if (ASICREV_IS_GFX1037(chipRevision)) + case FAMILY_MDN: + if (ASICREV_IS_MENDOCINO(chipRevision)) { m_settings.supportRbPlus = 1; m_settings.dccUnsup3DSwDis = 0; @@ -1460,13 +1446,15 @@ VOID Gfx10Lib::ConvertSwizzlePatternToEquation( ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern const { - ADDR_BIT_SETTING fullSwizzlePattern[20]; + // Get full swizzle pattern and store it as an ADDR_BIT_SETTING list + ADDR_BIT_SETTING fullSwizzlePattern[ADDR_MAX_EQUATION_BIT]; GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); const ADDR_BIT_SETTING* pSwizzle = fullSwizzlePattern; const UINT_32 blockSizeLog2 = GetBlockSizeLog2(swMode); - + memset(pEquation, 0, sizeof(ADDR_EQUATION)); pEquation->numBits = blockSizeLog2; + pEquation->numBitComponents = pPatInfo->maxItemCount; pEquation->stackedDepthSlices = FALSE; for (UINT_32 i = 0; i < elemLog2; i++) @@ -1994,37 +1982,45 @@ VOID Gfx10Lib::InitEquationTable() { memset(m_equationTable, 0, sizeof(m_equationTable)); + // Iterate through resourceTypes, up to MaxRsrcType where a "resourceType" refers to AddrResourceType (1D/2D/3D) + // resources. 
This starts with rsrcTypeIdx = 0, however there is an offset added that will start us off at + // computing 2D resources. for (UINT_32 rsrcTypeIdx = 0; rsrcTypeIdx < MaxRsrcType; rsrcTypeIdx++) { + // Add offset. Start iterating from ADDR_RSRC_TEX_2D const AddrResourceType rsrcType = static_cast(rsrcTypeIdx + ADDR_RSRC_TEX_2D); + // Iterate through the maximum number of swizzlemodes a type can hold for (UINT_32 swModeIdx = 0; swModeIdx < MaxSwModeType; swModeIdx++) { const AddrSwizzleMode swMode = static_cast(swModeIdx); + // Iterate through the different bits-per-pixel settings (8bpp/16bpp/32bpp/64bpp/128bpp) for (UINT_32 elemLog2 = 0; elemLog2 < MaxElementBytesLog2; elemLog2++) { UINT_32 equationIndex = ADDR_INVALID_EQUATION_INDEX; + // May or may not return a ADDR_SW_PATINFO for a completely different swizzle mode, essentially + // overwriting the choice. const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(swMode, rsrcType, elemLog2, 1); if (pPatInfo != NULL) { ADDR_ASSERT(IsValidSwMode(swMode)); - - if (pPatInfo->maxItemCount <= 3) + if (pPatInfo->maxItemCount <= 3) // Get a valid equationIndex { ADDR_EQUATION equation = {}; + // Passing in pPatInfo to get the addr equation ConvertSwizzlePatternToEquation(elemLog2, rsrcType, swMode, pPatInfo, &equation); equationIndex = m_numEquations; ADDR_ASSERT(equationIndex < EquationTableSize); - + // Updates m_equationTable[m_numEquations] to be the addr equation for this PatInfo m_equationTable[equationIndex] = equation; - + // Increment m_numEquations m_numEquations++; } - else + else // There is no equationIndex { // We only see "ill" equation from 64/128 BPE + 3D resource + SW_64KB_D_X under RB+ case ADDR_ASSERT((elemLog2 == 3) || (elemLog2 == 4)); @@ -2033,7 +2029,8 @@ VOID Gfx10Lib::InitEquationTable() ADDR_ASSERT(m_settings.supportRbPlus == 1); } } - + // equationIndex, which is used to look up equations in m_equationTable, will be cached for every + // iteration in this nested for-loop 
m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2] = equationIndex; } } @@ -2318,9 +2315,9 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeNonBlockCompressedView( { ADDR_E_RETURNCODE returnCode = ADDR_OK; - if (pIn->resourceType != ADDR_RSRC_TEX_2D) + if (IsThin(pIn->resourceType, pIn->swizzleMode) == FALSE) { - // Only 2D resource can have a NonBC view... + // Only thin swizzle mode can have a NonBC view... returnCode = ADDR_INVALIDPARAMS; } else if (((pIn->format < ADDR_FMT_ASTC_4x4) || (pIn->format > ADDR_FMT_ETC2_128BPP)) && @@ -2347,6 +2344,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeNonBlockCompressedView( infoIn.numFrags = 1; ADDR2_MIP_INFO mipInfo[MaxMipLevels] = {}; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); ADDR2_COMPUTE_SURFACE_INFO_OUTPUT infoOut = {}; infoOut.pMipInfo = mipInfo; @@ -2597,6 +2595,7 @@ BOOL_32 Gfx10Lib::ValidateSwModeParams( const BOOL_32 linear = IsLinear(swizzle); const BOOL_32 blk256B = IsBlock256b(swizzle); const BOOL_32 blkVar = IsBlockVariable(swizzle); + const BOOL_32 isNonPrtXor = IsNonPrtXor(swizzle); const BOOL_32 prt = flags.prt; const BOOL_32 fmask = flags.fmask; @@ -2652,7 +2651,7 @@ BOOL_32 Gfx10Lib::ValidateSwModeParams( { if (((swizzleMask & Gfx10Rsrc3dSwModeMask) == 0) || (prt && ((swizzleMask & Gfx10Rsrc3dPrtSwModeMask) == 0)) || - (thin3d && ((swizzleMask & Gfx10Rsrc3dThinSwModeMask) == 0))) + (thin3d && ((swizzleMask & Gfx10Rsrc3dViewAs2dSwModeMask) == 0))) { ADDR_ASSERT_ALWAYS(); valid = FALSE; @@ -2755,7 +2754,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceInfoSanityCheck( * Gfx10Lib::HwlGetPreferredSurfaceSetting * * @brief -* Internal function to get suggested surface information for cliet to use +* Internal function to get suggested surface information for client to use * * @return * ADDR_E_RETURNCODE @@ -2824,7 +2823,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( padSize[i] = PowTwoAlign(padSize[i], sizeAlignInElement); } - if (BlockTypeWithinMemoryBudget(padSize[0], + if 
(Addr2BlockTypeWithinMemoryBudget(padSize[0], padSize[1], ratioLow, ratioHi, @@ -2969,7 +2968,8 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( if (pIn->flags.view3dAs2dArray) { - allowedSwModeSet.value &= Gfx10Rsrc3dThinSwModeMask; + // SW_LINEAR can be used for 3D thin images, including BCn image format. + allowedSwModeSet.value &= Gfx10Rsrc3dViewAs2dSwModeMask; } break; @@ -3057,7 +3057,9 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( if (pIn->flags.needEquation) { - FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); } if (allowedSwModeSet.value == Gfx10LinearSwModeMask) @@ -3076,11 +3078,13 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( allowedSwModeSet.swLinear = 0; } + // A bitfield where each bit represents a block type. Each swizzle mode maps to a block. ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); // Determine block size if there are 2 or more block type candidates if (IsPow2(allowedBlockSet.value) == FALSE) { + // Tracks a valid SwizzleMode for each valid block type AddrSwizzleMode swMode[AddrBlockMaxTiledType] = {}; swMode[AddrBlockLinear] = ADDR_SW_LINEAR; @@ -3103,18 +3107,21 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( swMode[AddrBlockThin64KB] = ADDR_SW_64KB_S; } + // Tracks the size of each valid swizzle mode's surface in bytes UINT_64 padSize[AddrBlockMaxTiledType] = {}; const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); - UINT_32 minSizeBlk = AddrBlockMicro; - UINT_64 minSize = 0; + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); + UINT_32 minSizeBlk = AddrBlockMicro; // Tracks the most optimal block to use + UINT_64 minSize = 0; // Tracks the minimum acceptable block type ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; + // Iterate through all block types for (UINT_32 i = AddrBlockLinear; i < AddrBlockMaxTiledType; i++) { - if (IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + if (Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { localIn.swizzleMode = swMode[i]; @@ -3138,7 +3145,8 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( } else { - if (BlockTypeWithinMemoryBudget( + // Checks if the block type is within the memory budget but favors larger blocks + if (Addr2BlockTypeWithinMemoryBudget( minSize, padSize[i], ratioLow, @@ -3187,9 +3195,9 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) { if ((i != minSizeBlk) && - IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { - if (BlockTypeWithinMemoryBudget( + if (Addr2BlockTypeWithinMemoryBudget( minSize, padSize[i], 0, @@ -3679,6 +3687,7 @@ ADDR_E_RETURNCODE Gfx10Lib::ComputeSurfaceInfoMacroTiled( UINT_64 mipSize[MaxMipLevels]; UINT_64 mipSliceSize[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); Dim3d fixedTailMaxDim = tailMaxDim; if (m_settings.dsMipmapHtileFix && IsZOrderSwizzle(pIn->swizzleMode) && (index <= 1)) @@ -3895,54 +3904,23 @@ UINT_32 Gfx10Lib::ComputeOffsetFromEquation( { UINT_32 v = 0; - if (pEq->addr[i].valid) + for (UINT_32 c = 0; c < pEq->numBitComponents; c++) { - if (pEq->addr[i].channel == 0) + if (pEq->comps[c][i].valid) { - v ^= (x >> pEq->addr[i].index) & 1; - } - else if (pEq->addr[i].channel == 1) - { - v ^= (y >> pEq->addr[i].index) & 1; - } - else - { - 
ADDR_ASSERT(pEq->addr[i].channel == 2); - v ^= (z >> pEq->addr[i].index) & 1; - } - } - - if (pEq->xor1[i].valid) - { - if (pEq->xor1[i].channel == 0) - { - v ^= (x >> pEq->xor1[i].index) & 1; - } - else if (pEq->xor1[i].channel == 1) - { - v ^= (y >> pEq->xor1[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->xor1[i].channel == 2); - v ^= (z >> pEq->xor1[i].index) & 1; - } - } - - if (pEq->xor2[i].valid) - { - if (pEq->xor2[i].channel == 0) - { - v ^= (x >> pEq->xor2[i].index) & 1; - } - else if (pEq->xor2[i].channel == 1) - { - v ^= (y >> pEq->xor2[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->xor2[i].channel == 2); - v ^= (z >> pEq->xor2[i].index) & 1; + if (pEq->comps[c][i].channel == 0) + { + v ^= (x >> pEq->comps[c][i].index) & 1; + } + else if (pEq->comps[c][i].channel == 1) + { + v ^= (y >> pEq->comps[c][i].index) & 1; + } + else + { + ADDR_ASSERT(pEq->comps[c][i].channel == 2); + v ^= (z >> pEq->comps[c][i].index) & 1; + } } } @@ -4071,6 +4049,8 @@ const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( UINT_32 numFrag ///< Number of fragment ) const { + // Now elemLog2 is going to be used to access the correct index insode of the pPatInfo array so we will start from + // the right location const UINT_32 index = IsXor(swizzleMode) ? (m_colorBaseIndex + elemLog2) : elemLog2; const ADDR_SW_PATINFO* patInfo = NULL; const UINT_32 swizzleMask = 1 << swizzleMode; @@ -4133,8 +4113,15 @@ const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( { if (IsRtOptSwizzle(swizzleMode)) { - patInfo = m_settings.supportRbPlus ? - GFX10_SW_64K_R_X_1xaa_RBPLUS_PATINFO : GFX10_SW_64K_R_X_1xaa_PATINFO; + if (swizzleMode == ADDR_SW_4KB_R_X) + { + patInfo = NULL; + } + else + { + patInfo = m_settings.supportRbPlus ? + GFX10_SW_64K_R_X_1xaa_RBPLUS_PATINFO : GFX10_SW_64K_R_X_1xaa_PATINFO; + } } else if (IsZOrderSwizzle(swizzleMode)) { @@ -4228,6 +4215,10 @@ const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( patInfo = m_settings.supportRbPlus ? 
GFX10_SW_4K_D_RBPLUS_PATINFO : GFX10_SW_4K_D_PATINFO; } + else if (swizzleMode == ADDR_SW_4KB_R_X) + { + patInfo = NULL; + } else { ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_D_X); @@ -4351,6 +4342,7 @@ ADDR_E_RETURNCODE Gfx10Lib::ComputeSurfaceAddrFromCoordMicroTiled( ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {}; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; ADDR2_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); localIn.swizzleMode = pIn->swizzleMode; localIn.flags = pIn->flags; @@ -4417,6 +4409,7 @@ ADDR_E_RETURNCODE Gfx10Lib::ComputeSurfaceAddrFromCoordMacroTiled( ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {}; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; ADDR2_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); localIn.swizzleMode = pIn->swizzleMode; localIn.flags = pIn->flags; @@ -4809,4 +4802,4 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceInfoLinear( } // V2 } // Addr -} // rocr +} // namespace rocr diff --git a/src/image/addrlib/src/gfx10/gfx10addrlib.h b/src/image/addrlib/src/gfx10/gfx10addrlib.h index 542d51a43..9dbaefe0e 100644 --- a/src/image/addrlib/src/gfx10/gfx10addrlib.h +++ b/src/image/addrlib/src/gfx10/gfx10addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,8 +21,10 @@ #include "gfx10SwizzlePattern.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -57,8 +42,8 @@ struct Gfx10ChipSettings UINT_32 supportRbPlus : 1; UINT_32 dsMipmapHtileFix : 1; UINT_32 dccUnsup3DSwDis : 1; - UINT_32 : 2; - UINT_32 reserved2 : 26; + UINT_32 : 4; + UINT_32 reserved2 : 24; }; }; @@ -158,7 +143,11 @@ const UINT_32 Gfx10Rsrc3dPrtSwModeMask = Gfx10Rsrc2dPrtSwModeMask & ~Gfx10Displa const UINT_32 Gfx10Rsrc3dThin64KBSwModeMask = (1u << ADDR_SW_64KB_Z_X) | (1u << ADDR_SW_64KB_R_X); -const UINT_32 Gfx10Rsrc3dThinSwModeMask = Gfx10Rsrc3dThin64KBSwModeMask | 
Gfx10BlkVarSwModeMask; + +const UINT_32 Gfx10Rsrc3dThinSwModeMask = Gfx10Rsrc3dThin64KBSwModeMask | + Gfx10BlkVarSwModeMask; + +const UINT_32 Gfx10Rsrc3dViewAs2dSwModeMask = Gfx10Rsrc3dThinSwModeMask | Gfx10LinearSwModeMask; const UINT_32 Gfx10Rsrc3dThickSwModeMask = Gfx10Rsrc3dSwModeMask & ~(Gfx10Rsrc3dThinSwModeMask | Gfx10LinearSwModeMask); @@ -166,8 +155,9 @@ const UINT_32 Gfx10Rsrc3dThick4KBSwModeMask = Gfx10Rsrc3dThickSwModeMask & Gfx10 const UINT_32 Gfx10Rsrc3dThick64KBSwModeMask = Gfx10Rsrc3dThickSwModeMask & Gfx10Blk64KBSwModeMask; -const UINT_32 Gfx10MsaaSwModeMask = Gfx10ZSwModeMask | - Gfx10RenderSwModeMask; +const UINT_32 Gfx10MsaaSwModeMask = (Gfx10ZSwModeMask | + Gfx10RenderSwModeMask) + ; const UINT_32 Dcn20NonBpp64SwModeMask = (1u << ADDR_SW_LINEAR) | (1u << ADDR_SW_4KB_S) | @@ -396,6 +386,12 @@ class Gfx10Lib : public Lib UINT_32 log2Elem, UINT_32 numFrag) const; + /** + * Will use the indices, "nibbles", to build an index equation inside pSwizzle + * + * @param pPatInfo Pointer to a patInfo. Contains indices mapping to the 2D nibble arrays which will be used to build an index equation. + * @param pSwizzle Array to write the index equation to. + */ VOID GetSwizzlePatternFromPatternInfo( const ADDR_SW_PATINFO* pPatInfo, ADDR_BIT_SETTING (&pSwizzle)[20]) const @@ -570,12 +566,13 @@ class Gfx10Lib : public Lib UINT_32 m_colorBaseIndex; UINT_32 m_xmaskBaseIndex; + UINT_32 m_htileBaseIndex; UINT_32 m_dccBaseIndex; }; } // V2 } // Addr -} // rocr +} // namespace rocr #endif diff --git a/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h b/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h index c9f92bbea..1cdf84605 100644 --- a/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h +++ b/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,9 +17,10 @@ #define __GFX11_SWIZZLE_PATTERN_H__ namespace rocr { -namespace Addr { -namespace V2 { - +namespace Addr +{ +namespace V2 +{ const ADDR_SW_PATINFO GFX11_SW_256_D_PATINFO[] = { { 1, 0, 0, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_256_D @@ -3050,7 +3034,7 @@ const UINT_64 GFX11_HTILE_SW_PATTERN[][18] = {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, X5^Y8, Y6^X7, X6^Y7, X9, Y9, X10, }, //17 }; -} // V2 +}// V2 } // Addr -} // rocr +} // namespace rocr #endif diff --git a/src/image/addrlib/src/gfx11/gfx11addrlib.cpp b/src/image/addrlib/src/gfx11/gfx11addrlib.cpp index c56be1a57..bcaa539d4 100644 --- a/src/image/addrlib/src/gfx11/gfx11addrlib.cpp +++ b/src/image/addrlib/src/gfx11/gfx11addrlib.cpp @@ -2,24 +2,7 @@ 
************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,9 +20,9 @@ //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - namespace rocr { -namespace Addr { +namespace Addr +{ /** ************************************************************************************************************************ * Gfx11HwlInit @@ -289,18 +272,23 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeDccInfo( } else { - const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); - - const BOOL_32 isThick = IsThick(pIn->resourceType, pIn->swizzleMode); - - pOut->compressBlkWidth = isThick ? Block256_3d[elemLog2].w : Block256_2d[elemLog2].w; - pOut->compressBlkHeight = isThick ? Block256_3d[elemLog2].h : Block256_2d[elemLog2].h; - pOut->compressBlkDepth = isThick ? 
Block256_3d[elemLog2].d : 1; + const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); + const UINT_32 numFragLog2 = Log2(Max(pIn->numFrags, 1u)); + Dim3d compBlock = {}; + + GetCompressedBlockSizeLog2(Gfx11DataColor, + pIn->resourceType, + pIn->swizzleMode, + elemLog2, + numFragLog2, + &compBlock); + pOut->compressBlkWidth = 1 << compBlock.w; + pOut->compressBlkHeight = 1 << compBlock.h; + pOut->compressBlkDepth = 1 << compBlock.d; if (ret == ADDR_OK) { Dim3d metaBlk = {}; - const UINT_32 numFragLog2 = Log2(Max(pIn->numFrags, 1u)); const UINT_32 metaBlkSize = GetMetaBlkSize(Gfx11DataColor, pIn->resourceType, pIn->swizzleMode, @@ -386,6 +374,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeDccInfo( // Get the DCC address equation (copied from DccAddrFromCoord) const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); + const UINT_32 numPipeLog2 = m_pipesLog2; UINT_32 index = m_dccBaseIndex + elemLog2; const UINT_8* patIdxTable = (pIn->swizzleMode == ADDR_SW_64KB_R_X) ? GFX11_DCC_64K_R_X_PATIDX : GFX11_DCC_256K_R_X_PATIDX; @@ -739,27 +728,26 @@ ChipFamily Gfx11Lib::HwlConvertChipFamily( switch (chipFamily) { - case FAMILY_GFX1100: - if (ASICREV_IS_GFX1100(chipRevision)) - { - } - if (ASICREV_IS_GFX1101(chipRevision)) + case FAMILY_NV3: + if (ASICREV_IS_NAVI31_P(chipRevision)) { } - if (ASICREV_IS_GFX1102(chipRevision)) + if (ASICREV_IS_NAVI32_P(chipRevision)) { } - break; - case FAMILY_GFX1103: - if (ASICREV_IS_GFX1103(chipRevision)) + if (ASICREV_IS_NAVI33_P(chipRevision)) { } break; case FAMILY_GFX1150: if (ASICREV_IS_GFX1150(chipRevision)) { + m_settings.isGfx1150 = 1; } break; + case FAMILY_GFX1103: + m_settings.isGfx1103 = 1; + break; default: ADDR_ASSERT(!"Unknown chip family"); break; @@ -1103,13 +1091,14 @@ VOID Gfx11Lib::ConvertSwizzlePatternToEquation( ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern const { - ADDR_BIT_SETTING fullSwizzlePattern[20]; + ADDR_BIT_SETTING fullSwizzlePattern[ADDR_MAX_EQUATION_BIT]; GetSwizzlePatternFromPatternInfo(pPatInfo, 
fullSwizzlePattern); const ADDR_BIT_SETTING* pSwizzle = fullSwizzlePattern; const UINT_32 blockSizeLog2 = GetBlockSizeLog2(swMode); - + memset(pEquation, 0, sizeof(ADDR_EQUATION)); pEquation->numBits = blockSizeLog2; + pEquation->numBitComponents = pPatInfo->maxItemCount; pEquation->stackedDepthSlices = FALSE; for (UINT_32 i = 0; i < elemLog2; i++) @@ -1658,20 +1647,21 @@ VOID Gfx11Lib::InitEquationTable() { ADDR_ASSERT(IsValidSwMode(swMode)); - if (pPatInfo->maxItemCount <= 3) + if (pPatInfo->maxItemCount <= 3) // Get a valid equationIndex { ADDR_EQUATION equation = {}; + // Passing in pPatInfo to get the addr equation ConvertSwizzlePatternToEquation(elemLog2, rsrcType, swMode, pPatInfo, &equation); equationIndex = m_numEquations; ADDR_ASSERT(equationIndex < EquationTableSize); - + // Updates m_equationTable[m_numEquations] to be the addr equation for this PatInfo m_equationTable[equationIndex] = equation; - + // Increment m_numEquations m_numEquations++; } - else + else // There is no equationIndex { // We only see "ill" equation from 64/128 BPE + 3D resource + SW_64KB_D_X ADDR_ASSERT((elemLog2 == 3) || (elemLog2 == 4)); @@ -1744,7 +1734,19 @@ UINT_32 Gfx11Lib::GetValidDisplaySwizzleModes( if (bpp <= 64) { + const ChipFamily family = GetChipFamily(); + swModeMask = Dcn32SwModeMask; + + if (false + || (m_settings.isGfx1103) + || (m_settings.isGfx1150) + ) + { + // Not all GPUs support displaying with 256kB swizzle modes. + swModeMask &= ~((1u << ADDR_SW_256KB_D_X) | + (1u << ADDR_SW_256KB_R_X)); + } } return swModeMask; @@ -1936,15 +1938,15 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( { ADDR_E_RETURNCODE returnCode = ADDR_OK; - if (pIn->resourceType != ADDR_RSRC_TEX_2D) + if (IsThin(pIn->resourceType, pIn->swizzleMode) == FALSE) { - // Only 2D resource can have a NonBC view... + // Only thin swizzle mode can have a NonBC view... 
returnCode = ADDR_INVALIDPARAMS; } - else if ((pIn->format != ADDR_FMT_ASTC_8x8) && + else if (((pIn->format < ADDR_FMT_ASTC_4x4) || (pIn->format > ADDR_FMT_ETC2_128BPP)) && ((pIn->format < ADDR_FMT_BC1) || (pIn->format > ADDR_FMT_BC7))) { - // Only support BC1~BC7 or ASTC_8x8 for now... + // Only support BC1~BC7, ASTC, or ETC2 for now... returnCode = ADDR_NOTSUPPORTED; } else @@ -1957,8 +1959,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( infoIn.swizzleMode = pIn->swizzleMode; infoIn.resourceType = pIn->resourceType; infoIn.bpp = bpp; - infoIn.width = PowTwoAlign(pIn->width, bcWidth) / bcWidth; - infoIn.height = PowTwoAlign(pIn->height, bcHeight) / bcHeight; + infoIn.width = RoundUpQuotient(pIn->width, bcWidth); + infoIn.height = RoundUpQuotient(pIn->height, bcHeight); infoIn.numSlices = pIn->numSlices; infoIn.numMipLevels = pIn->numMipLevels; infoIn.numSamples = 1; @@ -2010,8 +2012,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( pOut->pipeBankXor = slicePbXorOut.pipeBankXor; const BOOL_32 inTail = tiled && (pIn->mipId >= infoOut.firstMipIdInTail) ? 
TRUE : FALSE; - const UINT_32 requestMipWidth = PowTwoAlign(Max(pIn->width >> pIn->mipId, 1u), bcWidth) / bcWidth; - const UINT_32 requestMipHeight = PowTwoAlign(Max(pIn->height >> pIn->mipId, 1u), bcHeight) / bcHeight; + const UINT_32 requestMipWidth = RoundUpQuotient(Max(pIn->width >> pIn->mipId, 1u), bcWidth); + const UINT_32 requestMipHeight = RoundUpQuotient(Max(pIn->height >> pIn->mipId, 1u), bcHeight); if (inTail) { @@ -2061,10 +2063,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( pOut->mipId = 1; pOut->numMipLevels = 2; - const UINT_32 upperMipWidth = - PowTwoAlign(Max(pIn->width >> (pIn->mipId - 1), 1u), bcWidth) / bcWidth; - const UINT_32 upperMipHeight = - PowTwoAlign(Max(pIn->height >> (pIn->mipId - 1), 1u), bcHeight) / bcHeight; + const UINT_32 upperMipWidth = RoundUpQuotient(Max(pIn->width >> (pIn->mipId - 1), 1u), bcWidth); + const UINT_32 upperMipHeight = RoundUpQuotient(Max(pIn->height >> (pIn->mipId - 1), 1u), bcHeight); const BOOL_32 needToAvoidInTail = tiled && (requestMipWidth <= infoOut.blockWidth / 2) && (requestMipHeight <= infoOut.blockHeight) ? @@ -2226,6 +2226,7 @@ BOOL_32 Gfx11Lib::ValidateSwModeParams( const BOOL_32 thin3d = flags.view3dAs2dArray; const BOOL_32 linear = IsLinear(swizzle); const BOOL_32 blk256B = IsBlock256b(swizzle); + const BOOL_32 isNonPrtXor = IsNonPrtXor(swizzle); const BOOL_32 prt = flags.prt; // Misc check @@ -2571,8 +2572,9 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( pOut->resourceType = pIn->resourceType; pOut->validSwModeSet = allowedSwModeSet; pOut->canXor = (allowedSwModeSet.value & Gfx11XorSwModeMask) ? 
TRUE : FALSE; - pOut->validBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); - pOut->validSwTypeSet = GetAllowedSwSet(allowedSwModeSet); + + GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType, &(pOut->validBlockSet)); + GetAllowedSwSet(allowedSwModeSet, &(pOut->validSwTypeSet)); pOut->clientPreferredSwSet = pIn->preferredSwSet; @@ -2584,7 +2586,9 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( // Apply optional restrictions if (pIn->flags.needEquation) { - FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); } if (allowedSwModeSet.value == Gfx11LinearSwModeMask) @@ -2603,7 +2607,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( allowedSwModeSet.swLinear = 0; } - ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); + ADDR2_BLOCK_SET allowedBlockSet = {}; + GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType, &allowedBlockSet); // Determine block size if there are 2 or more block type candidates if (IsPow2(allowedBlockSet.value) == FALSE) @@ -2632,6 +2637,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; UINT_64 minSize = 0; @@ -2639,7 +2645,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockLinear; i < AddrBlockMaxTiledType; i++) { - if (IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + if (Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { localIn.swizzleMode = swMode[i]; @@ -2657,7 +2663,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( padSize[i] = localOut.surfSize; if ((minSize == 0) || - BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) { minSize = padSize[i]; minSizeBlk = i; @@ -2702,9 +2708,9 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) { if ((i != minSizeBlk) && - IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { - if (BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) { // Clear the block type if the memory waste is unacceptable allowedBlockSet.value &= ~(1u << (i - 1)); @@ -2776,9 +2782,11 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } // Block type should be determined. 
- ADDR_ASSERT(IsPow2(GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType).value)); + GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType, &allowedBlockSet); + ADDR_ASSERT(IsPow2(allowedBlockSet.value)); - ADDR2_SWTYPE_SET allowedSwSet = GetAllowedSwSet(allowedSwModeSet); + ADDR2_SWTYPE_SET allowedSwSet = {}; + GetAllowedSwSet(allowedSwModeSet, &allowedSwSet); // Determine swizzle type if there are 2 or more swizzle type candidates if ((allowedSwSet.value != 0) && (IsPow2(allowedSwSet.value) == FALSE)) @@ -2819,7 +2827,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( { if (pIn->flags.color && allowedSwSet.sw_R) { - allowedSwModeSet.value &= Gfx11DisplaySwModeMask; + allowedSwModeSet.value &= Gfx11RenderSwModeMask; } else if (allowedSwSet.sw_S) { @@ -2827,7 +2835,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } else if (allowedSwSet.sw_D) { - allowedSwModeSet.value &= Gfx11RenderSwModeMask; + allowedSwModeSet.value &= Gfx11DisplaySwModeMask; } else { @@ -2856,7 +2864,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } // Swizzle type should be determined. - ADDR_ASSERT(IsPow2(GetAllowedSwSet(allowedSwModeSet).value)); + GetAllowedSwSet(allowedSwModeSet, &allowedSwSet); + ADDR_ASSERT(IsPow2(allowedSwSet.value)); } // Determine swizzle mode now. 
Always select the "largest" swizzle mode for a given block type + @@ -2883,6 +2892,271 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( return returnCode; } +/** +************************************************************************************************************************ +* Gfx11Lib::HwlGetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the client to choose from +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx11Lib::HwlGetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ///< [in] input structure + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pIn->flags.fmask) + { + // There is no FMASK for GFX11 ASICs. + ADDR_ASSERT_ALWAYS(); + + returnCode = ADDR_INVALIDPARAMS; + } + else + { + UINT_32 bpp = pIn->bpp; + UINT_32 width = Max(pIn->width, 1u); + UINT_32 height = Max(pIn->height, 1u); + + // Set format to INVALID will skip this conversion + if (pIn->format != ADDR_FMT_INVALID) + { + ElemMode elemMode = ADDR_UNCOMPRESSED; + UINT_32 expandX, expandY; + + // Get compression/expansion factors and element mode which indicates compression/expansion + bpp = GetElemLib()->GetBitsPerPixel(pIn->format, + &elemMode, + &expandX, + &expandY); + + UINT_32 basePitch = 0; + GetElemLib()->AdjustSurfaceInfo(elemMode, + expandX, + expandY, + &bpp, + &basePitch, + &width, + &height); + } + + const UINT_32 numSlices = Max(pIn->numSlices, 1u); + const UINT_32 numMipLevels = Max(pIn->numMipLevels, 1u); + const UINT_32 numSamples = Max(pIn->numSamples, 1u); + const BOOL_32 msaa = numSamples > 1; + + // Pre sanity check on non swizzle mode parameters + ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {}; + localIn.flags = pIn->flags; + localIn.resourceType 
= pIn->resourceType; + localIn.format = pIn->format; + localIn.bpp = bpp; + localIn.width = width; + localIn.height = height; + localIn.numSlices = numSlices; + localIn.numMipLevels = numMipLevels; + localIn.numSamples = numSamples; + localIn.numFrags = numSamples; + + if (ValidateNonSwModeParams(&localIn)) + { + // Allow appropriate swizzle modes by default + ADDR2_SWMODE_SET allowedSwModeSet = {}; + allowedSwModeSet.value |= Gfx11LinearSwModeMask | Gfx11Blk256BSwModeMask; + if (pIn->resourceType == ADDR_RSRC_TEX_3D) + { + allowedSwModeSet.value |= Gfx11Rsrc3dThick4KBSwModeMask | + Gfx11Rsrc3dThin64KBSwModeMask | + Gfx11Rsrc3dThick64KBSwModeMask | + Gfx11Rsrc3dThin256KBSwModeMask | + Gfx11Rsrc3dThick256KBSwModeMask; + } + else + { + allowedSwModeSet.value |= Gfx11Blk4KBSwModeMask | Gfx11Blk64KBSwModeMask | Gfx11Blk256KBSwModeMask; + } + + // Filter out invalid swizzle mode(s) by image attributes and HW restrictions + switch (pIn->resourceType) + { + case ADDR_RSRC_TEX_1D: + allowedSwModeSet.value &= Gfx11Rsrc1dSwModeMask; + break; + + case ADDR_RSRC_TEX_2D: + allowedSwModeSet.value &= pIn->flags.prt ? Gfx11Rsrc2dPrtSwModeMask : Gfx11Rsrc2dSwModeMask; + break; + + case ADDR_RSRC_TEX_3D: + allowedSwModeSet.value &= pIn->flags.prt ? Gfx11Rsrc3dPrtSwModeMask : Gfx11Rsrc3dSwModeMask; + + if (pIn->flags.view3dAs2dArray) + { + allowedSwModeSet.value &= Gfx11Rsrc3dThinSwModeMask; + } + break; + + default: + ADDR_ASSERT_ALWAYS(); + allowedSwModeSet.value = 0; + break; + } + + // TODO: figure out if following restrictions are correct on GFX11... 
+ if (ElemLib::IsBlockCompressed(pIn->format) || + ElemLib::IsMacroPixelPacked(pIn->format) || + (bpp > 64) || + (msaa && ((bpp > 32) || pIn->flags.color || pIn->flags.unordered))) + { + allowedSwModeSet.value &= ~Gfx11ZSwModeMask; + } + + if (pIn->format == ADDR_FMT_32_32_32) + { + allowedSwModeSet.value &= Gfx11LinearSwModeMask; + } + + if (msaa) + { + allowedSwModeSet.value &= Gfx11MsaaSwModeMask; + } + + if (pIn->flags.depth || pIn->flags.stencil) + { + allowedSwModeSet.value &= Gfx11ZSwModeMask; + } + + if (pIn->flags.display) + { + allowedSwModeSet.value &= GetValidDisplaySwizzleModes(bpp); + } + + if (allowedSwModeSet.value != 0) + { +#if DEBUG + // Post sanity check, at least AddrLib should accept the output generated by its own + UINT_32 validateSwModeSet = allowedSwModeSet.value; + + for (UINT_32 i = 0; validateSwModeSet != 0; i++) + { + if (validateSwModeSet & 1) + { + localIn.swizzleMode = static_cast(i); + ADDR_ASSERT(ValidateSwModeParams(&localIn)); + } + + validateSwModeSet >>= 1; + } +#endif + + pOut->resourceType = pIn->resourceType; + pOut->clientPreferredSwSet = pIn->preferredSwSet; + + if (pOut->clientPreferredSwSet.value == 0) + { + pOut->clientPreferredSwSet.value = AddrSwSetAll; + } + + if (pIn->flags.needEquation) + { + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); + } + + pOut->validSwModeSet = allowedSwModeSet; + pOut->canXor = (allowedSwModeSet.value & Gfx11XorSwModeMask) ? TRUE : FALSE; + } + else + { + // Invalid combination... + ADDR_ASSERT_ALWAYS(); + returnCode = ADDR_INVALIDPARAMS; + } + } + else + { + // Invalid combination... 
+ ADDR_ASSERT_ALWAYS(); + returnCode = ADDR_INVALIDPARAMS; + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx11Lib::HwlGetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx11Lib::HwlGetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + AddrResourceType rsrcType, ///< [in] resource type + ADDR2_BLOCK_SET* pAllowedBlockSet ///< [out] allowed block sizes + ) const +{ + ADDR2_BLOCK_SET allowedBlockSet = {}; + + allowedBlockSet.micro = (allowedSwModeSet.value & Gfx11Blk256BSwModeMask) ? TRUE : FALSE; + allowedBlockSet.linear = (allowedSwModeSet.value & Gfx11LinearSwModeMask) ? TRUE : FALSE; + + if (rsrcType == ADDR_RSRC_TEX_3D) + { + allowedBlockSet.macroThick4KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThick64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin256KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.gfx11.thick256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick256KBSwModeMask) ? TRUE : FALSE; + } + else + { + allowedBlockSet.macroThin4KB = (allowedSwModeSet.value & Gfx11Blk4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Blk64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Blk256KBSwModeMask) ? 
TRUE : FALSE; + } + + *pAllowedBlockSet = allowedBlockSet; + return ADDR_OK; +} + +/** +************************************************************************************************************************ +* Gfx11Lib::HwlGetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx11Lib::HwlGetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + ADDR2_SWTYPE_SET* pAllowedSwSet ///< [out] allowed swizzle types + ) const +{ + ADDR2_SWTYPE_SET allowedSwSet = {}; + + allowedSwSet.sw_Z = (allowedSwModeSet.value & Gfx11ZSwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_S = (allowedSwModeSet.value & Gfx11StandardSwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_D = (allowedSwModeSet.value & Gfx11DisplaySwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_R = (allowedSwModeSet.value & Gfx11RenderSwModeMask) ? 
TRUE : FALSE; + + *pAllowedSwSet = allowedSwSet; + return ADDR_OK; +} + /** ************************************************************************************************************************ * Gfx11Lib::ComputeStereoInfo @@ -3182,12 +3456,12 @@ ADDR_E_RETURNCODE Gfx11Lib::ComputeSurfaceInfoMacroTiled( UINT_64 mipSize[MaxMipLevels]; UINT_64 mipSliceSize[MaxMipLevels]; + // For htile, we need to make z16 and stencil enter the mip tail at the same time as z32 would Dim3d fixedTailMaxDim = tailMaxDim; - - if ((IsZOrderSwizzle(pIn->swizzleMode) || IsRtOptSwizzle(pIn->swizzleMode)) && (index <= 1)) + if (IsZOrderSwizzle(pIn->swizzleMode) && (index <= 1)) { fixedTailMaxDim.w /= Block256_2d[index].w / Block256_2d[2].w; - fixedTailMaxDim.h /= Block256_2d[index].h / Block256_2d[2].h; + fixedTailMaxDim.h /= Block256_2d[index].w / Block256_2d[2].w; } for (UINT_32 i = 0; i < pIn->numMipLevels; i++) @@ -3400,54 +3674,23 @@ UINT_32 Gfx11Lib::ComputeOffsetFromEquation( { UINT_32 v = 0; - if (pEq->addr[i].valid) - { - if (pEq->addr[i].channel == 0) - { - v ^= (x >> pEq->addr[i].index) & 1; - } - else if (pEq->addr[i].channel == 1) - { - v ^= (y >> pEq->addr[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->addr[i].channel == 2); - v ^= (z >> pEq->addr[i].index) & 1; - } - } - - if (pEq->xor1[i].valid) - { - if (pEq->xor1[i].channel == 0) - { - v ^= (x >> pEq->xor1[i].index) & 1; - } - else if (pEq->xor1[i].channel == 1) - { - v ^= (y >> pEq->xor1[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->xor1[i].channel == 2); - v ^= (z >> pEq->xor1[i].index) & 1; - } - } - - if (pEq->xor2[i].valid) + for (UINT_32 c = 0; c < pEq->numBitComponents; c++) { - if (pEq->xor2[i].channel == 0) - { - v ^= (x >> pEq->xor2[i].index) & 1; - } - else if (pEq->xor2[i].channel == 1) - { - v ^= (y >> pEq->xor2[i].index) & 1; - } - else + if (pEq->comps[c][i].valid) { - ADDR_ASSERT(pEq->xor2[i].channel == 2); - v ^= (z >> pEq->xor2[i].index) & 1; + if (pEq->comps[c][i].channel == 0) + { + v ^= (x >> 
pEq->comps[c][i].index) & 1; + } + else if (pEq->comps[c][i].channel == 1) + { + v ^= (y >> pEq->comps[c][i].index) & 1; + } + else + { + ADDR_ASSERT(pEq->comps[c][i].channel == 2); + v ^= (z >> pEq->comps[c][i].index) & 1; + } } } @@ -4033,6 +4276,7 @@ UINT_32 Gfx11Lib::HwlComputeMaxMetaBaseAlignments() const } // Max base alignment for 2D Dcc + // swizzle mode support DCC... const AddrSwizzleMode ValidSwizzleModeForDcc2D[] = { ADDR_SW_64KB_R_X, @@ -4250,4 +4494,4 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeSurfaceInfoLinear( } // V2 } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/gfx11/gfx11addrlib.h b/src/image/addrlib/src/gfx11/gfx11addrlib.h index 391eda2cb..78ffc46b4 100644 --- a/src/image/addrlib/src/gfx11/gfx11addrlib.h +++ b/src/image/addrlib/src/gfx11/gfx11addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,8 +21,10 @@ #include "gfx11SwizzlePattern.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -50,7 +35,9 @@ struct Gfx11ChipSettings { struct { - UINT_32 reserved1 : 32; + UINT_32 isGfx1150 : 1; + UINT_32 isGfx1103 : 1; + UINT_32 reserved1 : 30; // Misc configuration bits UINT_32 reserved2 : 32; @@ -285,6 +272,19 @@ class Gfx11Lib : public Lib const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + virtual ADDR_E_RETURNCODE HwlGetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlGetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const; + + virtual ADDR_E_RETURNCODE HwlGetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const; + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfoSanityCheck( const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; @@ -467,43 +467,6 @@ class Gfx11Lib : public Lib UINT_32 GetMaxNumMipsInTail(UINT_32 blockSizeLog2, BOOL_32 isThin) const; - static ADDR2_BLOCK_SET GetAllowedBlockSet(ADDR2_SWMODE_SET allowedSwModeSet, AddrResourceType rsrcType) - { - ADDR2_BLOCK_SET allowedBlockSet = {}; - - allowedBlockSet.micro = (allowedSwModeSet.value & Gfx11Blk256BSwModeMask) ? 
TRUE : FALSE; - allowedBlockSet.linear = (allowedSwModeSet.value & Gfx11LinearSwModeMask) ? TRUE : FALSE; - - if (rsrcType == ADDR_RSRC_TEX_3D) - { - allowedBlockSet.macroThick4KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick4KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macroThick64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin256KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.gfx11.thick256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick256KBSwModeMask) ? TRUE : FALSE; - } - else - { - allowedBlockSet.macroThin4KB = (allowedSwModeSet.value & Gfx11Blk4KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Blk64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Blk256KBSwModeMask) ? TRUE : FALSE; - } - - return allowedBlockSet; - } - - static ADDR2_SWTYPE_SET GetAllowedSwSet(ADDR2_SWMODE_SET allowedSwModeSet) - { - ADDR2_SWTYPE_SET allowedSwSet = {}; - - allowedSwSet.sw_Z = (allowedSwModeSet.value & Gfx11ZSwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_S = (allowedSwModeSet.value & Gfx11StandardSwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_D = (allowedSwModeSet.value & Gfx11DisplaySwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_R = (allowedSwModeSet.value & Gfx11RenderSwModeMask) ? 
TRUE : FALSE; - - return allowedSwSet; - } - BOOL_32 IsInMipTail( Dim3d mipTailDim, UINT_32 maxNumMipsInTail, @@ -555,5 +518,6 @@ class Gfx11Lib : public Lib } // V2 } // Addr -} // rocr +} // namespace rocr #endif + diff --git a/src/image/addrlib/src/gfx12/gfx12SwizzlePattern.h b/src/image/addrlib/src/gfx12/gfx12SwizzlePattern.h new file mode 100644 index 000000000..55508066d --- /dev/null +++ b/src/image/addrlib/src/gfx12/gfx12SwizzlePattern.h @@ -0,0 +1,280 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +/** +************************************************************************************************************************ +* @file gfx12SwizzlePattern.h +* @brief swizzle pattern for gfx12. 
+************************************************************************************************************************ +*/ + +#ifndef __GFX12_SWIZZLE_PATTERN_H__ +#define __GFX12_SWIZZLE_PATTERN_H__ + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + const ADDR_SW_PATINFO GFX12_SW_256B_2D_1xAA_PATINFO[] = + { + { 0, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_1xAA + { 1, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_1xAA + { 2, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_1xAA + { 3, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_1xAA + { 4, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256B_2D_2xAA_PATINFO[] = + { + { 5, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_2xAA + { 6, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_2xAA + { 7, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_2xAA + { 8, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_2xAA + { 9, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256B_2D_4xAA_PATINFO[] = + { + { 10, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_4xAA + { 11, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_4xAA + { 12, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_4xAA + { 13, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_4xAA + { 14, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256B_2D_8xAA_PATINFO[] = + { + { 15, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_8xAA + { 16, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_8xAA + { 17, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_8xAA + { 18, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_8xAA + { 19, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_1xAA_PATINFO[] = + { + { 0, 1, 0, 0, } , // 1 BPE @ SW_4KB_2D_1xAA + { 1, 2, 0, 0, } , // 2 BPE @ SW_4KB_2D_1xAA + { 2, 3, 0, 0, } , // 4 BPE @ SW_4KB_2D_1xAA + { 3, 4, 0, 0, } , // 8 BPE @ SW_4KB_2D_1xAA + { 4, 5, 0, 0, } , // 16 BPE @ SW_4KB_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_2xAA_PATINFO[] = + { + { 5, 2, 0, 0, } , // 1 BPE @ SW_4KB_2D_2xAA + { 6, 3, 0, 0, } , // 2 BPE @ SW_4KB_2D_2xAA + { 7, 4, 0, 0, } , // 4 BPE @ SW_4KB_2D_2xAA + { 8, 5, 0, 
0, } , // 8 BPE @ SW_4KB_2D_2xAA + { 9, 6, 0, 0, } , // 16 BPE @ SW_4KB_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_4xAA_PATINFO[] = + { + { 10, 3, 0, 0, } , // 1 BPE @ SW_4KB_2D_4xAA + { 11, 4, 0, 0, } , // 2 BPE @ SW_4KB_2D_4xAA + { 12, 5, 0, 0, } , // 4 BPE @ SW_4KB_2D_4xAA + { 13, 6, 0, 0, } , // 8 BPE @ SW_4KB_2D_4xAA + { 14, 7, 0, 0, } , // 16 BPE @ SW_4KB_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_8xAA_PATINFO[] = + { + { 15, 4, 0, 0, } , // 1 BPE @ SW_4KB_2D_8xAA + { 16, 5, 0, 0, } , // 2 BPE @ SW_4KB_2D_8xAA + { 17, 6, 0, 0, } , // 4 BPE @ SW_4KB_2D_8xAA + { 18, 7, 0, 0, } , // 8 BPE @ SW_4KB_2D_8xAA + { 19, 8, 0, 0, } , // 16 BPE @ SW_4KB_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_1xAA_PATINFO[] = + { + { 0, 1, 1, 0, } , // 1 BPE @ SW_64KB_2D_1xAA + { 1, 2, 2, 0, } , // 2 BPE @ SW_64KB_2D_1xAA + { 2, 3, 3, 0, } , // 4 BPE @ SW_64KB_2D_1xAA + { 3, 4, 4, 0, } , // 8 BPE @ SW_64KB_2D_1xAA + { 4, 5, 5, 0, } , // 16 BPE @ SW_64KB_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_2xAA_PATINFO[] = + { + { 5, 2, 2, 0, } , // 1 BPE @ SW_64KB_2D_2xAA + { 6, 3, 3, 0, } , // 2 BPE @ SW_64KB_2D_2xAA + { 7, 4, 4, 0, } , // 4 BPE @ SW_64KB_2D_2xAA + { 8, 5, 5, 0, } , // 8 BPE @ SW_64KB_2D_2xAA + { 9, 6, 6, 0, } , // 16 BPE @ SW_64KB_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_4xAA_PATINFO[] = + { + { 10, 3, 3, 0, } , // 1 BPE @ SW_64KB_2D_4xAA + { 11, 4, 4, 0, } , // 2 BPE @ SW_64KB_2D_4xAA + { 12, 5, 5, 0, } , // 4 BPE @ SW_64KB_2D_4xAA + { 13, 6, 6, 0, } , // 8 BPE @ SW_64KB_2D_4xAA + { 14, 7, 7, 0, } , // 16 BPE @ SW_64KB_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_8xAA_PATINFO[] = + { + { 15, 4, 4, 0, } , // 1 BPE @ SW_64KB_2D_8xAA + { 16, 5, 5, 0, } , // 2 BPE @ SW_64KB_2D_8xAA + { 17, 6, 6, 0, } , // 4 BPE @ SW_64KB_2D_8xAA + { 18, 7, 7, 0, } , // 8 BPE @ SW_64KB_2D_8xAA + { 19, 8, 8, 0, } , // 16 BPE @ SW_64KB_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_1xAA_PATINFO[] = + { + { 0, 1, 
1, 1, } , // 1 BPE @ SW_256KB_2D_1xAA + { 1, 2, 2, 2, } , // 2 BPE @ SW_256KB_2D_1xAA + { 2, 3, 3, 3, } , // 4 BPE @ SW_256KB_2D_1xAA + { 3, 4, 4, 4, } , // 8 BPE @ SW_256KB_2D_1xAA + { 4, 5, 5, 5, } , // 16 BPE @ SW_256KB_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_2xAA_PATINFO[] = + { + { 5, 2, 2, 2, } , // 1 BPE @ SW_256KB_2D_2xAA + { 6, 3, 3, 3, } , // 2 BPE @ SW_256KB_2D_2xAA + { 7, 4, 4, 4, } , // 4 BPE @ SW_256KB_2D_2xAA + { 8, 5, 5, 5, } , // 8 BPE @ SW_256KB_2D_2xAA + { 9, 6, 6, 6, } , // 16 BPE @ SW_256KB_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_4xAA_PATINFO[] = + { + { 10, 3, 3, 3, } , // 1 BPE @ SW_256KB_2D_4xAA + { 11, 4, 4, 4, } , // 2 BPE @ SW_256KB_2D_4xAA + { 12, 5, 5, 5, } , // 4 BPE @ SW_256KB_2D_4xAA + { 13, 6, 6, 6, } , // 8 BPE @ SW_256KB_2D_4xAA + { 14, 7, 7, 7, } , // 16 BPE @ SW_256KB_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_8xAA_PATINFO[] = + { + { 15, 4, 4, 4, } , // 1 BPE @ SW_256KB_2D_8xAA + { 16, 5, 5, 5, } , // 2 BPE @ SW_256KB_2D_8xAA + { 17, 6, 6, 6, } , // 4 BPE @ SW_256KB_2D_8xAA + { 18, 7, 7, 7, } , // 8 BPE @ SW_256KB_2D_8xAA + { 19, 8, 8, 8, } , // 16 BPE @ SW_256KB_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_3D_PATINFO[] = + { + { 20, 9, 0, 0, } , // 1 BPE @ SW_4KB_3D + { 21, 10, 0, 0, } , // 2 BPE @ SW_4KB_3D + { 22, 11, 0, 0, } , // 4 BPE @ SW_4KB_3D + { 23, 12, 0, 0, } , // 8 BPE @ SW_4KB_3D + { 24, 13, 0, 0, } , // 16 BPE @ SW_4KB_3D + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_3D_PATINFO[] = + { + { 20, 9, 9, 0, } , // 1 BPE @ SW_64KB_3D + { 21, 10, 10, 0, } , // 2 BPE @ SW_64KB_3D + { 22, 11, 11, 0, } , // 4 BPE @ SW_64KB_3D + { 23, 12, 12, 0, } , // 8 BPE @ SW_64KB_3D + { 24, 13, 13, 0, } , // 16 BPE @ SW_64KB_3D + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_3D_PATINFO[] = + { + { 20, 9, 9, 9, } , // 1 BPE @ SW_256KB_3D + { 21, 10, 10, 9, } , // 2 BPE @ SW_256KB_3D + { 22, 11, 11, 10, } , // 4 BPE @ SW_256KB_3D + { 23, 12, 12, 11, } , // 8 BPE @ SW_256KB_3D + { 24, 
13, 13, 11, } , // 16 BPE @ SW_256KB_3D + }; + + + const UINT_64 GFX12_SW_PATTERN_NIBBLE1[][8] = + { + {X0, X1, Y0, X2, Y1, Y2, X3, Y3, }, // 0 + {0, X0, Y0, X1, Y1, X2, Y2, X3, }, // 1 + {0, 0, X0, Y0, X1, Y1, X2, Y2, }, // 2 + {0, 0, 0, X0, Y0, X1, X2, Y1, }, // 3 + {0, 0, 0, 0, X0, Y0, X1, Y1, }, // 4 + {S0, X0, Y0, X1, Y1, X2, Y2, X3, }, // 5 + {0, S0, X0, Y0, X1, Y1, X2, Y2, }, // 6 + {0, 0, S0, X0, Y0, X1, Y1, X2, }, // 7 + {0, 0, 0, S0, X0, Y0, X1, Y1, }, // 8 + {0, 0, 0, 0, S0, X0, Y0, X1, }, // 9 + {S0, S1, X0, Y0, X1, Y1, X2, Y2, }, // 10 + {0, S0, S1, X0, Y0, X1, Y1, X2, }, // 11 + {0, 0, S0, S1, X0, Y0, X1, Y1, }, // 12 + {0, 0, 0, S0, S1, X0, Y0, X1, }, // 13 + {0, 0, 0, 0, S0, S1, X0, Y0, }, // 14 + {S0, S1, S2, X0, Y0, X1, Y1, X2, }, // 15 + {0, S0, S1, S2, X0, Y0, X1, Y1, }, // 16 + {0, 0, S0, S1, S2, X0, Y0, X1, }, // 17 + {0, 0, 0, S0, S1, S2, X0, Y0, }, // 18 + {0, 0, 0, 0, S0, S1, S2, X0, }, // 19 + {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, }, // 20 + {0, X0, Z0, Y0, X1, Z1, Y1, Z2, }, // 21 + {0, 0, X0, Y0, X1, Z0, Y1, Z1, }, // 22 + {0, 0, 0, X0, Y0, Z0, X1, Z1, }, // 23 + {0, 0, 0, 0, X0, Z0, Y0, Z1, }, // 24 + }; + + const UINT_64 GFX12_SW_PATTERN_NIBBLE2[][4] = + { + {0, 0, 0, 0, }, // 0 + {Y4, X4, Y5, X5, }, // 1 + {Y3, X4, Y4, X5, }, // 2 + {Y3, X3, Y4, X4, }, // 3 + {Y2, X3, Y3, X4, }, // 4 + {Y2, X2, Y3, X3, }, // 5 + {Y1, X2, Y2, X3, }, // 6 + {Y1, X1, Y2, X2, }, // 7 + {Y0, X1, Y1, X2, }, // 8 + {Y2, X3, Z3, Y3, }, // 9 + {Y2, X2, Z3, Y3, }, // 10 + {Y2, X2, Z2, Y3, }, // 11 + {Y1, X2, Z2, Y2, }, // 12 + {Y1, X1, Z2, Y2, }, // 13 + }; + + const UINT_64 GFX12_SW_PATTERN_NIBBLE3[][4] = + { + {0, 0, 0, 0, }, // 0 + {Y6, X6, Y7, X7, }, // 1 + {Y5, X6, Y6, X7, }, // 2 + {Y5, X5, Y6, X6, }, // 3 + {Y4, X5, Y5, X6, }, // 4 + {Y4, X4, Y5, X5, }, // 5 + {Y3, X4, Y4, X5, }, // 6 + {Y3, X3, Y4, X4, }, // 7 + {Y2, X3, Y3, X4, }, // 8 + {X4, Z4, Y4, X5, }, // 9 + {X3, Z4, Y4, X4, }, // 10 + {X3, Z3, Y4, X4, }, // 11 + {X3, Z3, Y3, X4, }, // 12 + {X2, Z3, 
Y3, X3, }, // 13 + }; + + const UINT_64 GFX12_SW_PATTERN_NIBBLE4[][2] = + { + {0, 0, }, // 0 + {Y8, X8, }, // 1 + {Y7, X8, }, // 2 + {Y7, X7, }, // 3 + {Y6, X7, }, // 4 + {Y6, X6, }, // 5 + {Y5, X6, }, // 6 + {Y5, X5, }, // 7 + {Y4, X5, }, // 8 + {Z5, Y5, }, // 9 + {Z4, Y5, }, // 10 + {Z4, Y4, }, // 11 + }; + +} // V3 +} // Addr +} // namespace +#endif diff --git a/src/image/addrlib/src/gfx12/gfx12addrlib.cpp b/src/image/addrlib/src/gfx12/gfx12addrlib.cpp new file mode 100644 index 000000000..1b2e5e563 --- /dev/null +++ b/src/image/addrlib/src/gfx12/gfx12addrlib.cpp @@ -0,0 +1,1313 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +/** +************************************************************************************************************************ +* @file gfx12addrlib.cpp +* @brief Contain the implementation for the Gfx12Lib class. +************************************************************************************************************************ +*/ + +#include "gfx12addrlib.h" +#include "gfx12_gb_reg.h" + +#include "amdgpu_asic_addr.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +namespace rocr { +namespace Addr +{ +/** +************************************************************************************************************************ +* Gfx12HwlInit +* +* @brief +* Creates an Gfx12Lib object. +* +* @return +* Returns an Gfx12Lib object pointer. 
+************************************************************************************************************************ +*/ +Addr::Lib* Gfx12HwlInit( + const Client* pClient) +{ + return V3::Gfx12Lib::CreateObj(pClient); +} + +namespace V3 +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Static Const Member +//////////////////////////////////////////////////////////////////////////////////////////////////// +const SwizzleModeFlags Gfx12Lib::SwizzleModeTable[ADDR3_MAX_TYPE] = +{//Linear 2d 3d 256B 4KB 64KB 256KB Reserved + {{1, 0, 0, 0, 0, 0, 0, 0}}, // ADDR3_LINEAR + {{0, 1, 0, 1, 0, 0, 0, 0}}, // ADDR3_256B_2D + {{0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR3_4KB_2D + {{0, 1, 0, 0, 0, 1, 0, 0}}, // ADDR3_64KB_2D + {{0, 1, 0, 0, 0, 0, 1, 0}}, // ADDR3_256KB_2D + {{0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR3_4KB_3D + {{0, 0, 1, 0, 0, 1, 0, 0}}, // ADDR3_64KB_3D + {{0, 0, 1, 0, 0, 0, 1, 0}}, // ADDR3_256KB_3D +}; + +const ADDR_EXTENT3D Gfx12Lib::Block4K_Log2_3d[] = {{4, 4, 4}, {3, 4, 4}, {3, 4, 3}, {3, 3, 3}, {2, 3, 3}}; +const ADDR_EXTENT3D Gfx12Lib::Block64K_Log2_3d[] = {{6, 5, 5}, {5, 5, 5}, {5, 5, 4}, {5, 4, 4}, {4, 4, 4}}; +const ADDR_EXTENT3D Gfx12Lib::Block256K_Log2_3d[] = {{6, 6, 6}, {5, 6, 6}, {5, 6, 5}, {5, 5, 5}, {4, 5, 5}}; + +/** +************************************************************************************************************************ +* Gfx12Lib::Gfx12Lib +* +* @brief +* Constructor +* +************************************************************************************************************************ +*/ +Gfx12Lib::Gfx12Lib( + const Client* pClient) + : + Lib(pClient), + m_numSwizzleBits(0) +{ + memset(&m_settings, 0, sizeof(m_settings)); + memcpy(m_swizzleModeTable, SwizzleModeTable, sizeof(SwizzleModeTable)); +} + +/** +************************************************************************************************************************ +* Gfx12Lib::~Gfx12Lib +* +* @brief +* Destructor 
+************************************************************************************************************************ +*/ +Gfx12Lib::~Gfx12Lib() +{ +} + +/** +************************************************************************************************************************ +* Gfx12Lib::ConvertSwizzlePatternToEquation +* +* @brief +* Convert swizzle pattern to equation. +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Gfx12Lib::ConvertSwizzlePatternToEquation( + UINT_32 elemLog2, ///< [in] element bytes log2 + Addr3SwizzleMode swMode, ///< [in] swizzle mode + const ADDR_SW_PATINFO* pPatInfo, ///< [in] swizzle pattern info + ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern + const +{ + ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K]; + GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); + + const ADDR_BIT_SETTING* pSwizzle = fullSwizzlePattern; + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(swMode, TRUE); + + pEquation->numBits = blockSizeLog2; + pEquation->stackedDepthSlices = FALSE; + + for (UINT_32 i = 0; i < elemLog2; i++) + { + pEquation->addr[i].channel = 0; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = i; + } + + for (UINT_32 i = elemLog2; i < blockSizeLog2; i++) + { + ADDR_ASSERT(IsPow2(pSwizzle[i].value)); + + if (pSwizzle[i].x != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].x))); + + pEquation->addr[i].channel = 0; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = Log2(pSwizzle[i].x) + elemLog2; + } + else if (pSwizzle[i].y != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].y))); + + pEquation->addr[i].channel = 1; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = Log2(pSwizzle[i].y); + } + else if (pSwizzle[i].z != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].z))); + + pEquation->addr[i].channel = 2; + pEquation->addr[i].valid = 1; + 
pEquation->addr[i].index = Log2(pSwizzle[i].z); + } + else if (pSwizzle[i].s != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].s))); + + pEquation->addr[i].channel = 3; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = Log2(pSwizzle[i].s); + } + else + { + ADDR_ASSERT_ALWAYS(); + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::InitEquationTable +* +* @brief +* Initialize Equation table. +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Gfx12Lib::InitEquationTable() +{ + memset(m_equationTable, 0, sizeof(m_equationTable)); + + for (UINT_32 swModeIdx = 0; swModeIdx < ADDR3_MAX_TYPE; swModeIdx++) + { + const Addr3SwizzleMode swMode = static_cast(swModeIdx); + + if (IsLinear(swMode)) + { + // Skip linear equation (data table is not useful for 2D/3D images-- only contains x-coordinate bits) + continue; + } + + const UINT_32 maxMsaa = Is2dSwizzle(swMode) ? 
MaxMsaaRateLog2 : 1; + + for (UINT_32 msaaIdx = 0; msaaIdx < maxMsaa; msaaIdx++) + { + for (UINT_32 elemLog2 = 0; elemLog2 < MaxElementBytesLog2; elemLog2++) + { + UINT_32 equationIndex = ADDR_INVALID_EQUATION_INDEX; + const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(swMode, elemLog2, 1 << msaaIdx); + + if (pPatInfo != NULL) + { + ADDR_ASSERT(IsValidSwMode(swMode)); + + ADDR_EQUATION equation = {}; + + ConvertSwizzlePatternToEquation(elemLog2, swMode, pPatInfo, &equation); + + equationIndex = m_numEquations; + ADDR_ASSERT(equationIndex < NumSwizzlePatterns); + + m_equationTable[equationIndex] = equation; + m_numEquations++; + } + SetEquationTableEntry(swMode, msaaIdx, elemLog2, equationIndex); + } + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetBlockPixelDimensions +* +* @brief +* Returns the pixel dimensions of one block. +* +************************************************************************************************************************ +*/ +ADDR_EXTENT3D Gfx12Lib::GetBlockPixelDimensions( + Addr3SwizzleMode swizzleMode, + UINT_32 log2BytesPerPixel + ) const +{ + ADDR_EXTENT3D log2Dim = {}; + + switch (swizzleMode) + { + case ADDR3_4KB_3D: + log2Dim = Block4K_Log2_3d[log2BytesPerPixel]; + break; + case ADDR3_64KB_3D: + log2Dim = Block64K_Log2_3d[log2BytesPerPixel]; + break; + case ADDR3_256KB_3D: + log2Dim = Block256K_Log2_3d[log2BytesPerPixel]; + break; + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + return { 1u << log2Dim.width, 1u << log2Dim.height, 1u << log2Dim.depth }; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetMipOrigin +* +* @brief +* Internal function to calculate origins of the mip levels +* +* @return +* ADDR_E_RETURNCODE 
+************************************************************************************************************************ +*/ +VOID Gfx12Lib::GetMipOrigin( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + const ADDR_EXTENT3D& mipExtentFirstInTail, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + const BOOL_32 is3d = Is3dSwizzle(pIn->swizzleMode); + const UINT_32 bytesPerPixel = pIn->bpp >> 3; + const UINT_32 log2Bpp = Log2(bytesPerPixel); + const ADDR_EXTENT3D pixelBlockDims = GetBlockPixelDimensions(ADDR3_4KB_3D, log2Bpp); + const ADDR_EXTENT3D tailMaxDim = GetMipTailDim(pIn->swizzleMode, + pOut->blockExtent); + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const UINT_32 maxMipsInTail = GetMaxNumMipsInTail(pIn->swizzleMode, blockSizeLog2); + + UINT_32 pitch = tailMaxDim.width; + UINT_32 height = tailMaxDim.height; + + UINT_32 depth = (is3d ? PowTwoAlign(mipExtentFirstInTail.depth, pixelBlockDims.depth) : 1); + + const UINT_32 tailMaxDepth = (is3d ? (depth / pixelBlockDims.depth) : 1); + + for (UINT_32 i = pOut->firstMipIdInTail; i < pIn->numMipLevels; i++) + { + INT_32 mipInTail = static_cast(i) - static_cast(pOut->firstMipIdInTail); + if ((mipInTail < 0) || (pIn->numMipLevels == 1)) + { + mipInTail = MaxMipLevels; + } + + // "m" can be negative + const INT_32 signedM = static_cast(maxMipsInTail) - static_cast(1) - mipInTail; + const UINT_32 m = Max(0, signedM); + const UINT_32 mipOffset = (m > 6) ? 
(16 << m) : (m << 8); + + pOut->pMipInfo[i].offset = mipOffset * tailMaxDepth; + pOut->pMipInfo[i].mipTailOffset = mipOffset; + pOut->pMipInfo[i].macroBlockOffset = 0; + + pOut->pMipInfo[i].pitch = pitch; + pOut->pMipInfo[i].height = height; + pOut->pMipInfo[i].depth = depth; + + if (IsLinear(pIn->swizzleMode)) + { + pOut->pMipInfo[i].mipTailCoordX = mipOffset >> 8; + pOut->pMipInfo[i].mipTailCoordY = 0; + pOut->pMipInfo[i].mipTailCoordZ = 0; + + pitch = Max(pitch >> 1, 1u); + } + else + { + UINT_32 mipX = ((mipOffset >> 9) & 1) | + ((mipOffset >> 10) & 2) | + ((mipOffset >> 11) & 4) | + ((mipOffset >> 12) & 8) | + ((mipOffset >> 13) & 16) | + ((mipOffset >> 14) & 32); + UINT_32 mipY = ((mipOffset >> 8) & 1) | + ((mipOffset >> 9) & 2) | + ((mipOffset >> 10) & 4) | + ((mipOffset >> 11) & 8) | + ((mipOffset >> 12) & 16) | + ((mipOffset >> 13) & 32); + + if (is3d == FALSE) + { + pOut->pMipInfo[i].mipTailCoordX = mipX * Block256_2d[log2Bpp].w; + pOut->pMipInfo[i].mipTailCoordY = mipY * Block256_2d[log2Bpp].h; + pOut->pMipInfo[i].mipTailCoordZ = 0; + + pitch = Max(pitch >> 1, Block256_2d[log2Bpp].w); + height = Max(height >> 1, Block256_2d[log2Bpp].h); + depth = 1; + } + else + { + pOut->pMipInfo[i].mipTailCoordX = mipX * pixelBlockDims.width; + pOut->pMipInfo[i].mipTailCoordY = mipY * pixelBlockDims.height; + pOut->pMipInfo[i].mipTailCoordZ = 0; + + pitch = Max(pitch >> 1, pixelBlockDims.width); + height = Max(height >> 1, pixelBlockDims.height); + depth = PowTwoAlign(Max(depth >> 1, 1u), pixelBlockDims.depth); + } + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetMipOffset +* +* @brief +* Internal function to calculate alignment for a surface +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +VOID Gfx12Lib::GetMipOffset( + const 
ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + const UINT_32 bytesPerPixel = pIn->bpp >> 3; + const UINT_32 log2Bpp = Log2(bytesPerPixel); + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const UINT_32 blockSize = 1 << blockSizeLog2; + const ADDR_EXTENT3D tailMaxDim = GetMipTailDim(pIn->swizzleMode, + pOut->blockExtent); + const ADDR_EXTENT3D mip0Dims = GetBaseMipExtents(pIn); + const UINT_32 maxMipsInTail = GetMaxNumMipsInTail(pIn->swizzleMode, blockSizeLog2); + + UINT_32 firstMipInTail = pIn->numMipLevels; + UINT_64 mipChainSliceSize = 0; + UINT_64 mipSize[MaxMipLevels]; + UINT_64 mipSliceSize[MaxMipLevels]; + + const ADDR_EXTENT3D fixedTailMaxDim = tailMaxDim; + + for (UINT_32 mipIdx = 0; mipIdx < pIn->numMipLevels; mipIdx++) + { + const ADDR_EXTENT3D mipExtents = GetMipExtent(mip0Dims, mipIdx); + + if (SupportsMipTail(pIn->swizzleMode) && + IsInMipTail(fixedTailMaxDim, mipExtents, maxMipsInTail, pIn->numMipLevels - mipIdx)) + { + firstMipInTail = mipIdx; + mipChainSliceSize += blockSize / pOut->blockExtent.depth; + break; + } + else + { + const UINT_32 pitch = UseCustomPitch(pIn) + ? pOut->pitch + : ((mipIdx == 0) && CanTrimLinearPadding(pIn)) + ? PowTwoAlign(mipExtents.width, 128u / bytesPerPixel) + : PowTwoAlign(mipExtents.width, pOut->blockExtent.width); + const UINT_32 height = UseCustomHeight(pIn) + ? pOut->height + : PowTwoAlign(mipExtents.height, pOut->blockExtent.height); + const UINT_32 depth = PowTwoAlign(mipExtents.depth, pOut->blockExtent.depth); + + // The original "blockExtent" calculation does subtraction of logs (i.e., division) to get the + // sizes. We aligned our pitch and height to those sizes, which means we need to multiply the various + // factors back together to get back to the slice size. 
+ const UINT_64 sliceSize = static_cast(pitch) * height * pIn->numSamples * (pIn->bpp >> 3); + + mipSize[mipIdx] = sliceSize * depth; + mipSliceSize[mipIdx] = sliceSize * pOut->blockExtent.depth; + mipChainSliceSize += sliceSize; + + if (pOut->pMipInfo != NULL) + { + pOut->pMipInfo[mipIdx].pitch = pitch; + pOut->pMipInfo[mipIdx].height = height; + pOut->pMipInfo[mipIdx].depth = depth; + + // The slice size of a linear image was calculated above as if the "pitch" is 256 byte aligned. + // However, the rendering pitch is aligned to 128 bytes, and that is what needs to be reported + // to our clients. + if (IsLinear(pIn->swizzleMode)) + { + pOut->pMipInfo[mipIdx].pitch = PowTwoAlign(mipExtents.width, 128u / bytesPerPixel); + } + } + } + } + + pOut->sliceSize = mipChainSliceSize; + pOut->surfSize = mipChainSliceSize * pOut->numSlices; + pOut->mipChainInTail = (firstMipInTail == 0) ? TRUE : FALSE; + pOut->firstMipIdInTail = firstMipInTail; + + if (pOut->pMipInfo != NULL) + { + if (IsLinear(pIn->swizzleMode)) + { + // 1. Linear swizzle mode doesn't have miptails. + // 2. The organization of linear 3D mipmap resource is same as GFX11, we should use mip slice size to + // caculate mip offset. 
+ ADDR_ASSERT(firstMipInTail == pIn->numMipLevels); + + UINT_64 sliceSize = 0; + + for (INT_32 i = static_cast(pIn->numMipLevels) - 1; i >= 0; i--) + { + pOut->pMipInfo[i].offset = sliceSize; + pOut->pMipInfo[i].macroBlockOffset = sliceSize; + pOut->pMipInfo[i].mipTailOffset = 0; + + sliceSize += mipSliceSize[i]; + } + } + else + { + UINT_64 offset = 0; + UINT_64 macroBlkOffset = 0; + UINT_32 tailMaxDepth = 0; + + ADDR_EXTENT3D mipExtentFirstInTail = {}; + if (firstMipInTail != pIn->numMipLevels) + { + mipExtentFirstInTail = GetMipExtent(mip0Dims, firstMipInTail); + + offset = blockSize * + PowTwoAlign(mipExtentFirstInTail.depth, + pOut->blockExtent.depth) / pOut->blockExtent.depth; + macroBlkOffset = blockSize; + } + + for (INT_32 i = firstMipInTail - 1; i >= 0; i--) + { + pOut->pMipInfo[i].offset = offset; + pOut->pMipInfo[i].macroBlockOffset = macroBlkOffset; + pOut->pMipInfo[i].mipTailOffset = 0; + + offset += mipSize[i]; + macroBlkOffset += mipSliceSize[i]; + } + + GetMipOrigin(pIn, mipExtentFirstInTail, pOut); + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSurfaceInfo +* +* @brief +* Internal function to calculate alignment for a surface +* +* @return +* VOID +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + ComputeBlockDimensionForSurf(&pOut->blockExtent, + pIn->bpp, + pIn->numSamples, + pIn->swizzleMode); + + ADDR_E_RETURNCODE returnCode = ApplyCustomizedPitchHeight(pIn, pOut); + + if (returnCode == ADDR_OK) + { + pOut->numSlices = PowTwoAlign(pIn->numSlices, pOut->blockExtent.depth); + pOut->baseAlign = 1 << GetBlockSizeLog2(pIn->swizzleMode); + + 
GetMipOffset(pIn, pOut); + + SanityCheckSurfSize(pIn, pOut); + + // Slices must be exact multiples of the block sizes. However: + // - with 3D images, one block will contain multiple slices, so that needs to be taken into account. + // - with linear images that have only once slice, we may trim and use the pitch alignment for size. + ADDR_ASSERT(((pOut->sliceSize * pOut->blockExtent.depth) % + GetBlockSize(pIn->swizzleMode, CanTrimLinearPadding(pIn))) == 0); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetBaseMipExtents +* +* @brief +* Return the size of the base mip level in a nice cozy little structure. +* +************************************************************************************************************************ +*/ +ADDR_EXTENT3D Gfx12Lib::GetBaseMipExtents( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return { pIn->width, + pIn->height, + (IsTex3d(pIn->resourceType) ? pIn->numSlices : 1) }; // slices is depth for 3d +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetMaxNumMipsInTail +* +* @brief +* Return max number of mips in tails +* +* @return +* Max number of mips in tails +************************************************************************************************************************ +*/ +UINT_32 Gfx12Lib::GetMaxNumMipsInTail( + Addr3SwizzleMode swizzleMode, + UINT_32 blockSizeLog2 ///< block size log2 + ) const +{ + UINT_32 effectiveLog2 = blockSizeLog2; + UINT_32 mipsInTail = 1; + + if (Is3dSwizzle(swizzleMode)) + { + effectiveLog2 -= (blockSizeLog2 - 8) / 3; + } + + if (effectiveLog2 > 8) + { + mipsInTail = (effectiveLog2 <= 11) ? 
(1 + (1 << (effectiveLog2 - 9))) : (effectiveLog2 - 4); + } + + return mipsInTail; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled +* +* @brief +* Internal function to calculate address from coord for tiled swizzle surface +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + // 256B block cannot support 3D image. + ADDR_ASSERT((IsTex3d(pIn->resourceType) && IsBlock256b(pIn->swizzleMode)) == FALSE); + + ADDR3_COMPUTE_SURFACE_INFO_INPUT localIn = {}; + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; + ADDR3_MIP_INFO mipInfo[MaxMipLevels]; + + localIn.size = sizeof(localIn); + localIn.flags = pIn->flags; + localIn.swizzleMode = pIn->swizzleMode; + localIn.resourceType = pIn->resourceType; + localIn.format = ADDR_FMT_INVALID; + localIn.bpp = pIn->bpp; + localIn.width = Max(pIn->unAlignedDims.width, 1u); + localIn.height = Max(pIn->unAlignedDims.height, 1u); + localIn.numSlices = Max(pIn->unAlignedDims.depth, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + localOut.size = sizeof(localOut); + localOut.pMipInfo = mipInfo; + + ADDR_E_RETURNCODE ret = ComputeSurfaceInfo(&localIn, &localOut); + + if (ret == ADDR_OK) + { + const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); + const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const UINT_32 eqIndex = GetEquationTableEntry(pIn->swizzleMode, Log2(localIn.numSamples), elemLog2); + + if (eqIndex != ADDR_INVALID_EQUATION_INDEX) + { + const BOOL_32 inTail = 
((mipInfo[pIn->mipId].mipTailOffset != 0) && (blkSizeLog2 != Log2Size256)); + const BOOL_32 is3dNoMsaa = ((IsTex3d(pIn->resourceType) == TRUE) && (localIn.numSamples == 1)); + const UINT_64 sliceSize = is3dNoMsaa ? (localOut.sliceSize * localOut.blockExtent.depth) + : localOut.sliceSize; + const UINT_32 sliceId = is3dNoMsaa ? (pIn->slice / localOut.blockExtent.depth) : pIn->slice; + const UINT_32 x = inTail ? (pIn->x + mipInfo[pIn->mipId].mipTailCoordX) : pIn->x; + const UINT_32 y = inTail ? (pIn->y + mipInfo[pIn->mipId].mipTailCoordY) : pIn->y; + const UINT_32 z = inTail ? (pIn->slice + mipInfo[pIn->mipId].mipTailCoordZ) : pIn->slice; + const UINT_32 pb = mipInfo[pIn->mipId].pitch / localOut.blockExtent.width; + const UINT_32 yb = pIn->y / localOut.blockExtent.height; + const UINT_32 xb = pIn->x / localOut.blockExtent.width; + const UINT_64 blkIdx = yb * pb + xb; + const UINT_32 blkOffset = ComputeOffsetFromEquation(&m_equationTable[eqIndex], + x << elemLog2, + y, + z, + pIn->sample); + pOut->addr = sliceSize * sliceId + + mipInfo[pIn->mipId].macroBlockOffset + + (blkIdx << blkSizeLog2) + + blkOffset; + } + else + { + ret = ADDR_INVALIDPARAMS; + } + } + + return ret; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputePipeBankXor +* +* @brief +* Generate a PipeBankXor value to be ORed into bits above numSwizzleBits of address +* +* @return +* PipeBankXor value +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure + ) const +{ + if ((m_numSwizzleBits != 0) && // does this configuration support swizzling + // base address XOR in GFX12 will be applied to all blk_size = 4KB, 64KB, or 256KB swizzle modes, + 
// Note that Linear and 256B are excluded. + (IsLinear(pIn->swizzleMode) == FALSE) && + (IsBlock256b(pIn->swizzleMode) == FALSE)) + { + pOut->pipeBankXor = pIn->surfIndex % (1 << m_numSwizzleBits); + } + else + { + pOut->pipeBankXor = 0; + } + + return ADDR_OK; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::ComputeOffsetFromEquation +* +* @brief +* Compute offset from equation +* +* @return +* Offset +************************************************************************************************************************ +*/ +UINT_32 Gfx12Lib::ComputeOffsetFromEquation( + const ADDR_EQUATION* pEq, ///< Equation + UINT_32 x, ///< x coord in bytes + UINT_32 y, ///< y coord in pixel + UINT_32 z, ///< z coord in slice + UINT_32 s ///< MSAA sample index + ) const +{ + UINT_32 offset = 0; + + for (UINT_32 i = 0; i < pEq->numBits; i++) + { + UINT_32 v = 0; + + if (pEq->addr[i].valid) + { + if (pEq->addr[i].channel == 0) + { + v ^= (x >> pEq->addr[i].index) & 1; + } + else if (pEq->addr[i].channel == 1) + { + v ^= (y >> pEq->addr[i].index) & 1; + } + else if (pEq->addr[i].channel == 2) + { + v ^= (z >> pEq->addr[i].index) & 1; + } + else if (pEq->addr[i].channel == 3) + { + v ^= (s >> pEq->addr[i].index) & 1; + } + else + { + ADDR_ASSERT_ALWAYS(); + } + } + + offset |= (v << i); + } + + return offset; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetSwizzlePatternInfo +* +* @brief +* Get swizzle pattern +* +* @return +* Swizzle pattern information +************************************************************************************************************************ +*/ +const ADDR_SW_PATINFO* Gfx12Lib::GetSwizzlePatternInfo( + Addr3SwizzleMode swizzleMode, ///< Swizzle mode + UINT_32 elemLog2, ///< Element size in bytes log2 + UINT_32 numFrag ///< Number of fragment + ) const +{ 
+ const ADDR_SW_PATINFO* patInfo = NULL; + + if (Is2dSwizzle(swizzleMode) == FALSE) + { + ADDR_ASSERT(numFrag == 1); + } + + switch (swizzleMode) + { + case ADDR3_256KB_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_256KB_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_256KB_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_256KB_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_256KB_2D_8xAA_PATINFO; + break; + default: + ADDR_ASSERT_ALWAYS(); + } + break; + case ADDR3_256KB_3D: + patInfo = GFX12_SW_256KB_3D_PATINFO; + break; + case ADDR3_64KB_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_64KB_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_64KB_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_64KB_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_64KB_2D_8xAA_PATINFO; + break; + default: + ADDR_ASSERT_ALWAYS(); + } + break; + case ADDR3_64KB_3D: + patInfo = GFX12_SW_64KB_3D_PATINFO; + break; + case ADDR3_4KB_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_4KB_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_4KB_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_4KB_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_4KB_2D_8xAA_PATINFO; + break; + default: + ADDR_ASSERT_ALWAYS(); + } + break; + case ADDR3_4KB_3D: + patInfo = GFX12_SW_4KB_3D_PATINFO; + break; + case ADDR3_256B_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_256B_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_256B_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_256B_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_256B_2D_8xAA_PATINFO; + break; + default: + break; + } + break; + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + return (patInfo != NULL) ? 
&patInfo[elemLog2] : NULL; +} +/** +************************************************************************************************************************ +* Gfx12Lib::HwlInitGlobalParams +* +* @brief +* Initializes global parameters +* +* @return +* TRUE if all settings are valid +* +************************************************************************************************************************ +*/ +BOOL_32 Gfx12Lib::HwlInitGlobalParams( + const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input +{ + BOOL_32 valid = TRUE; + GB_ADDR_CONFIG_GFX12 gbAddrConfig; + + gbAddrConfig.u32All = pCreateIn->regValue.gbAddrConfig; + + switch (gbAddrConfig.bits.NUM_PIPES) + { + case ADDR_CONFIG_1_PIPE: + m_pipesLog2 = 0; + break; + case ADDR_CONFIG_2_PIPE: + m_pipesLog2 = 1; + break; + case ADDR_CONFIG_4_PIPE: + m_pipesLog2 = 2; + break; + case ADDR_CONFIG_8_PIPE: + m_pipesLog2 = 3; + break; + case ADDR_CONFIG_16_PIPE: + m_pipesLog2 = 4; + break; + case ADDR_CONFIG_32_PIPE: + m_pipesLog2 = 5; + break; + case ADDR_CONFIG_64_PIPE: + m_pipesLog2 = 6; + break; + default: + ADDR_ASSERT_ALWAYS(); + valid = FALSE; + break; + } + + switch (gbAddrConfig.bits.PIPE_INTERLEAVE_SIZE) + { + case ADDR_CONFIG_PIPE_INTERLEAVE_256B: + m_pipeInterleaveLog2 = 8; + break; + case ADDR_CONFIG_PIPE_INTERLEAVE_512B: + m_pipeInterleaveLog2 = 9; + break; + case ADDR_CONFIG_PIPE_INTERLEAVE_1KB: + m_pipeInterleaveLog2 = 10; + break; + case ADDR_CONFIG_PIPE_INTERLEAVE_2KB: + m_pipeInterleaveLog2 = 11; + break; + default: + ADDR_ASSERT_ALWAYS(); + valid = FALSE; + break; + } + + m_numSwizzleBits = ((m_pipesLog2 >= 3) ? m_pipesLog2 - 2 : 0); + + if (valid) + { + InitEquationTable(); + } + + return valid; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeNonBlockCompressedView +* +* @brief +* Compute non-block-compressed view for a given mipmap level/slice. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (((pIn->format < ADDR_FMT_ASTC_4x4) || (pIn->format > ADDR_FMT_ETC2_128BPP)) && + ((pIn->format < ADDR_FMT_BC1) || (pIn->format > ADDR_FMT_BC7))) + { + // Only support BC1~BC7, ASTC, or ETC2 for now... + returnCode = ADDR_NOTSUPPORTED; + } + else + { + UINT_32 bcWidth, bcHeight; + const UINT_32 bpp = GetElemLib()->GetBitsPerPixel(pIn->format, NULL, &bcWidth, &bcHeight); + + ADDR3_COMPUTE_SURFACE_INFO_INPUT infoIn = {}; + infoIn.size = sizeof(infoIn); + infoIn.flags = pIn->flags; + infoIn.swizzleMode = pIn->swizzleMode; + infoIn.resourceType = pIn->resourceType; + infoIn.format = pIn->format; + infoIn.bpp = bpp; + infoIn.width = RoundUpQuotient(pIn->unAlignedDims.width, bcWidth); + infoIn.height = RoundUpQuotient(pIn->unAlignedDims.height, bcHeight); + infoIn.numSlices = pIn->unAlignedDims.depth; + infoIn.numMipLevels = pIn->numMipLevels; + infoIn.numSamples = 1; + + ADDR3_MIP_INFO mipInfo[MaxMipLevels] = {}; + + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT infoOut = {}; + infoOut.size = sizeof(infoOut); + infoOut.pMipInfo = mipInfo; + + returnCode = HwlComputeSurfaceInfo(&infoIn, &infoOut); + + if (returnCode == ADDR_OK) + { + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT subOffIn = {}; + subOffIn.size = sizeof(subOffIn); + subOffIn.swizzleMode = infoIn.swizzleMode; + subOffIn.resourceType = infoIn.resourceType; + subOffIn.pipeBankXor = pIn->pipeBankXor; + subOffIn.slice = pIn->slice; + subOffIn.sliceSize = infoOut.sliceSize; + subOffIn.macroBlockOffset = mipInfo[pIn->mipId].macroBlockOffset; + subOffIn.mipTailOffset = 
mipInfo[pIn->mipId].mipTailOffset; + + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT subOffOut = {}; + subOffOut.size = sizeof(subOffOut); + + // For any mipmap level, move nonBc view base address by offset + HwlComputeSubResourceOffsetForSwizzlePattern(&subOffIn, &subOffOut); + pOut->offset = subOffOut.offset; + + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT slicePbXorIn = {}; + slicePbXorIn.size = sizeof(slicePbXorIn); + slicePbXorIn.swizzleMode = infoIn.swizzleMode; + slicePbXorIn.resourceType = infoIn.resourceType; + slicePbXorIn.bpe = infoIn.bpp; + slicePbXorIn.basePipeBankXor = pIn->pipeBankXor; + slicePbXorIn.slice = pIn->slice; + slicePbXorIn.numSamples = 1; + + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT slicePbXorOut = {}; + slicePbXorOut.size = sizeof(slicePbXorOut); + + // For any mipmap level, nonBc view should use computed pbXor + HwlComputeSlicePipeBankXor(&slicePbXorIn, &slicePbXorOut); + pOut->pipeBankXor = slicePbXorOut.pipeBankXor; + + const BOOL_32 tiled = (pIn->swizzleMode != ADDR3_LINEAR); + const BOOL_32 inTail = tiled && (pIn->mipId >= infoOut.firstMipIdInTail); + const UINT_32 requestMipWidth = + RoundUpQuotient(Max(pIn->unAlignedDims.width >> pIn->mipId, 1u), bcWidth); + const UINT_32 requestMipHeight = + RoundUpQuotient(Max(pIn->unAlignedDims.height >> pIn->mipId, 1u), bcHeight); + + if (inTail) + { + // For mipmap level that is in mip tail block, hack a lot of things... + // Basically all mipmap levels in tail block will be viewed as a small mipmap chain that all levels + // are fit in tail block: + + // - mipId = relative mip id (which is counted from first mip ID in tail in original mip chain) + pOut->mipId = pIn->mipId - infoOut.firstMipIdInTail; + + // - at least 2 mipmap levels (since only 1 mipmap level will not be viewed as mipmap!) 
+ pOut->numMipLevels = Max(infoIn.numMipLevels - infoOut.firstMipIdInTail, 2u); + + // - (mip0) width = requestMipWidth << mipId, the value can't exceed mip tail dimension threshold + pOut->unAlignedDims.width = Min(requestMipWidth << pOut->mipId, infoOut.blockExtent.width / 2); + + // - (mip0) height = requestMipHeight << mipId, the value can't exceed mip tail dimension threshold + pOut->unAlignedDims.height = Min(requestMipHeight << pOut->mipId, infoOut.blockExtent.height); + } + // This check should cover at least mipId == 0 + else if ((requestMipWidth << pIn->mipId) == infoIn.width) + { + // For mipmap level [N] that is not in mip tail block and downgraded without losing element: + // - only one mipmap level and mipId = 0 + pOut->mipId = 0; + pOut->numMipLevels = 1; + + // (mip0) width = requestMipWidth + pOut->unAlignedDims.width = requestMipWidth; + + // (mip0) height = requestMipHeight + pOut->unAlignedDims.height = requestMipHeight; + } + else + { + // For mipmap level [N] that is not in mip tail block and downgraded with element losing, + // We have to make it a multiple mipmap view (2 levels view here), add one extra element if needed, + // because single mip view may have different pitch value than original (multiple) mip view... + // A simple case would be: + // - 64KB block swizzle mode, 8 Bytes-Per-Element. Block dim = [0x80, 0x40] + // - 2 mipmap levels with API mip0 width = 0x401/mip1 width = 0x200 and non-BC view + // mip0 width = 0x101/mip1 width = 0x80 + // By multiple mip view, the pitch for mip level 1 would be 0x100 bytes, due to rounding up logic in + // GetMipSize(), and by single mip level view the pitch will only be 0x80 bytes. 
+ + // - 2 levels and mipId = 1 + pOut->mipId = 1; + pOut->numMipLevels = 2; + + const UINT_32 upperMipWidth = + RoundUpQuotient(Max(pIn->unAlignedDims.width >> (pIn->mipId - 1), 1u), bcWidth); + const UINT_32 upperMipHeight = + RoundUpQuotient(Max(pIn->unAlignedDims.height >> (pIn->mipId - 1), 1u), bcHeight); + + const BOOL_32 needToAvoidInTail = tiled && + (requestMipWidth <= infoOut.blockExtent.width / 2) && + (requestMipHeight <= infoOut.blockExtent.height); + + const UINT_32 hwMipWidth = + PowTwoAlign(ShiftCeil(infoIn.width, pIn->mipId), infoOut.blockExtent.width); + const UINT_32 hwMipHeight = + PowTwoAlign(ShiftCeil(infoIn.height, pIn->mipId), infoOut.blockExtent.height); + + const BOOL_32 needExtraWidth = + ((upperMipWidth < requestMipWidth * 2) || + ((upperMipWidth == requestMipWidth * 2) && + ((needToAvoidInTail == TRUE) || + (hwMipWidth > PowTwoAlign(requestMipWidth, infoOut.blockExtent.width))))); + + const BOOL_32 needExtraHeight = + ((upperMipHeight < requestMipHeight * 2) || + ((upperMipHeight == requestMipHeight * 2) && + ((needToAvoidInTail == TRUE) || + (hwMipHeight > PowTwoAlign(requestMipHeight, infoOut.blockExtent.height))))); + + // (mip0) width = requestLastMipLevelWidth + pOut->unAlignedDims.width = upperMipWidth + (needExtraWidth ? 1: 0); + + // (mip0) height = requestLastMipLevelHeight + pOut->unAlignedDims.height = upperMipHeight + (needExtraHeight ? 
1: 0); + } + + // Assert the downgrading from this mip[0] width would still generate correct mip[N] width + ADDR_ASSERT(ShiftRight(pOut->unAlignedDims.width, pOut->mipId) == requestMipWidth); + // Assert the downgrading from this mip[0] height would still generate correct mip[N] height + ADDR_ASSERT(ShiftRight(pOut->unAlignedDims.height, pOut->mipId) == requestMipHeight); + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Compute sub resource offset to support swizzle pattern +* +* @return +* VOID +************************************************************************************************************************ +*/ +VOID Gfx12Lib::HwlComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut ///< [out] output structure + ) const +{ + pOut->offset = pIn->slice * pIn->sliceSize + pIn->macroBlockOffset; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSlicePipeBankXor +* +* @brief +* Generate slice PipeBankXor value based on base PipeBankXor value and slice id +* +* @return +* PipeBankXor value +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSlicePipeBankXor( + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + // PipeBankXor is only applied to 4KB, 64KB and 256KB on GFX12. 
+ if ((IsLinear(pIn->swizzleMode) == FALSE) && (IsBlock256b(pIn->swizzleMode) == FALSE)) + { + if (pIn->bpe == 0) + { + // Require a valid bytes-per-element value passed from client... + returnCode = ADDR_INVALIDPARAMS; + } + else + { + const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(pIn->swizzleMode, + Log2(pIn->bpe >> 3), + 1); + + if (pPatInfo != NULL) + { + const UINT_32 elemLog2 = Log2(pIn->bpe >> 3); + const UINT_32 eqIndex = GetEquationTableEntry(pIn->swizzleMode, Log2(pIn->numSamples), elemLog2); + + const UINT_32 pipeBankXorOffset = ComputeOffsetFromEquation(&m_equationTable[eqIndex], + 0, + 0, + pIn->slice, + 0); + + const UINT_32 pipeBankXor = pipeBankXorOffset >> m_pipeInterleaveLog2; + + // Should have no bit set under pipe interleave + ADDR_ASSERT((pipeBankXor << m_pipeInterleaveLog2) == pipeBankXorOffset); + + pOut->pipeBankXor = pIn->basePipeBankXor ^ pipeBankXor; + } + else + { + // Should never come here... + ADDR_NOT_IMPLEMENTED(); + + returnCode = ADDR_NOTSUPPORTED; + } + } + } + else + { + pOut->pipeBankXor = 0; + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::SanityCheckSurfSize +* +* @brief +* Calculate the surface size via the exact hardware algorithm to see if it matches. +* +* @return +************************************************************************************************************************ +*/ +void Gfx12Lib::SanityCheckSurfSize( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + const ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut + ) const +{ +#if DEBUG + // Verify that the requested image size is valid for the below algorithm. The below code includes + // implicit assumptions about the surface dimensions being less than "MaxImageDim"; otherwise, it can't + // calculate "firstMipInTail" accurately and the below assertion will trip incorrectly. 
// + // Surfaces destined for use only on the SDMA engine can exceed the gfx-engine-imposed limitations of + // the "maximum" image dimensions. + if ((pIn->width <= MaxImageDim) && + (pIn->height <= MaxImageDim) && + (pIn->numMipLevels <= MaxMipLevels) && + (UseCustomPitch(pIn) == FALSE) && + (UseCustomHeight(pIn) == FALSE) && + // HiZS surfaces have a reduced image size (i.e., each pixel represents an 8x8 region of the parent + // image, at least for single samples) but they still have the same number of mip levels as the + // parent image. This disconnect produces false assertions below as the image size doesn't apparently + // support the specified number of mip levels. + ((pIn->flags.hiZHiS == 0) || (pIn->numMipLevels == 1))) + { + UINT_32 lastMipSize = 1; + UINT_32 dataChainSize = 0; + + const ADDR_EXTENT3D mip0Dims = GetBaseMipExtents(pIn); + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const ADDR_EXTENT3D tailMaxDim = GetMipTailDim(pIn->swizzleMode, pOut->blockExtent); + const UINT_32 maxMipsInTail = GetMaxNumMipsInTail(pIn->swizzleMode, blockSizeLog2); + + UINT_32 firstMipInTail = 0; + for (INT_32 mipIdx = MaxMipLevels - 1; mipIdx >= 0; mipIdx--) + { + const ADDR_EXTENT3D mipExtents = GetMipExtent(mip0Dims, mipIdx); + + if ((mipExtents.width <= tailMaxDim.width) && + (mipExtents.height <= tailMaxDim.height) && + ((static_cast<INT_32>(pIn->numMipLevels) - mipIdx) < static_cast<INT_32>(maxMipsInTail))) + { + firstMipInTail = mipIdx; + } + } + + for (INT_32 mipIdx = firstMipInTail - 1; mipIdx >= -1; mipIdx--) + { + const ADDR_EXTENT3D mipExtents = GetMipExtent(mip0Dims, mipIdx); + const UINT_32 mipBlockWidth = ShiftCeil(mipExtents.width, Log2(pOut->blockExtent.width)); + const UINT_32 mipBlockHeight = ShiftCeil(mipExtents.height, Log2(pOut->blockExtent.height)); + + if (mipIdx < (static_cast<INT_32>(pIn->numMipLevels) - 1)) + { + dataChainSize += lastMipSize; + } + + if (mipIdx >= 0) + { + lastMipSize = 4 * lastMipSize + - ((mipBlockWidth & 1) ?
mipBlockHeight : 0) + - ((mipBlockHeight & 1) ? mipBlockWidth : 0) + - ((mipBlockWidth & mipBlockHeight & 1) ? 1 : 0); + } + } + + if (CanTrimLinearPadding(pIn)) + { + ADDR_ASSERT((pOut->sliceSize * pOut->blockExtent.depth) <= (dataChainSize << blockSizeLog2)); + } + else + { + ADDR_ASSERT((pOut->sliceSize * pOut->blockExtent.depth) == (dataChainSize << blockSizeLog2)); + } + } +#endif +} + +} // V3 +} // Addr +} // namespace rocr diff --git a/src/image/addrlib/src/gfx12/gfx12addrlib.h b/src/image/addrlib/src/gfx12/gfx12addrlib.h new file mode 100644 index 000000000..902a60b82 --- /dev/null +++ b/src/image/addrlib/src/gfx12/gfx12addrlib.h @@ -0,0 +1,218 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +/** +************************************************************************************************************************ +* @file gfx12addrlib.h +* @brief Contains the Gfx12Lib class definition. +************************************************************************************************************************ +*/ + +#ifndef __GFX12_ADDR_LIB_H__ +#define __GFX12_ADDR_LIB_H__ + +#include "addrlib3.h" +#include "coord.h" +#include "gfx12SwizzlePattern.h" + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + +/** +************************************************************************************************************************ +* @brief GFX12 specific settings structure. 
+************************************************************************************************************************ +*/ +struct Gfx12ChipSettings +{ + struct + { + // Misc configuration bits + UINT_32 reserved : 32; + }; +}; + +/** +************************************************************************************************************************ +* @brief GFX12 data surface type. +************************************************************************************************************************ +*/ + +/** +************************************************************************************************************************ +* @brief This class is the GFX12 specific address library +* function set. +************************************************************************************************************************ +*/ +class Gfx12Lib : public Lib +{ +public: + /// Creates Gfx12Lib object + static Addr::Lib* CreateObj(const Client* pClient) + { + VOID* pMem = Object::ClientAlloc(sizeof(Gfx12Lib), pClient); + return (pMem != NULL) ? new (pMem) Gfx12Lib(pClient) : NULL; + } + +protected: + Gfx12Lib(const Client* pClient); + virtual ~Gfx12Lib(); + + // Meta surfaces such as Hi-S/Z are essentially images on GFX12, so just return the max + // image alignment. 
+ virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const { return 256 * 1024; } + + UINT_32 GetMaxNumMipsInTail( + Addr3SwizzleMode swizzleMode, + UINT_32 blockSizeLog2) const; + + BOOL_32 IsInMipTail( + const ADDR_EXTENT3D& mipTailDim, + const ADDR_EXTENT3D& mipDims, + UINT_32 maxNumMipsInTail, + UINT_32 numMipsToTheEnd) const + { + BOOL_32 inTail = ((mipDims.width <= mipTailDim.width) && + (mipDims.height <= mipTailDim.height) && + (numMipsToTheEnd <= maxNumMipsInTail)); + + return inTail; + } + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) const; + + virtual VOID HwlComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlComputeSlicePipeBankXor( + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) const; + + virtual UINT_32 HwlGetEquationTableInfo(const ADDR_EQUATION** ppEquationTable) const + { + *ppEquationTable = m_equationTable; + + return m_numEquations; + } + +private: + Gfx12ChipSettings m_settings; + static const SwizzleModeFlags SwizzleModeTable[ADDR3_MAX_TYPE]; + + virtual ADDR_E_RETURNCODE HwlComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const override; + + virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) override; + + void SanityCheckSurfSize( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + const ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + UINT_32 m_numSwizzleBits; + + static const ADDR_EXTENT3D Block4K_Log2_3d[]; + static const ADDR_EXTENT3D 
Block64K_Log2_3d[]; + static const ADDR_EXTENT3D Block256K_Log2_3d[]; + + // Initialize equation table + VOID InitEquationTable(); + + VOID GetSwizzlePatternFromPatternInfo( + const ADDR_SW_PATINFO* pPatInfo, + ADDR_BIT_SETTING (&pSwizzle)[Log2Size256K]) const + { + memcpy(pSwizzle, + GFX12_SW_PATTERN_NIBBLE1[pPatInfo->nibble1Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE1[pPatInfo->nibble1Idx])); + + memcpy(&pSwizzle[8], + GFX12_SW_PATTERN_NIBBLE2[pPatInfo->nibble2Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE2[pPatInfo->nibble2Idx])); + + memcpy(&pSwizzle[12], + GFX12_SW_PATTERN_NIBBLE3[pPatInfo->nibble3Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE3[pPatInfo->nibble3Idx])); + + memcpy(&pSwizzle[16], + GFX12_SW_PATTERN_NIBBLE4[pPatInfo->nibble4Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE4[pPatInfo->nibble4Idx])); + } + + VOID ConvertSwizzlePatternToEquation( + UINT_32 elemLog2, + Addr3SwizzleMode swMode, + const ADDR_SW_PATINFO* pPatInfo, + ADDR_EQUATION* pEquation) const; + + ADDR_EXTENT3D GetBaseMipExtents( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + + ADDR_EXTENT3D GetBlockPixelDimensions( + Addr3SwizzleMode swizzleMode, + UINT_32 log2BytesPerPixel) const; + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const override; + + static ADDR_EXTENT3D GetMipExtent( + const ADDR_EXTENT3D& mip0, + UINT_32 mipId) + { + return { + ShiftCeil(Max(mip0.width, 1u), mipId), + ShiftCeil(Max(mip0.height, 1u), mipId), + ShiftCeil(Max(mip0.depth, 1u), mipId) + }; + } + + //# See 6.3 in //gfxip/gfx10/doc/architecture/ImageAddressing/gfx10_image_addressing.docx + // miptail is applied to only larger block size (4kb, 64kb, 256kb), so there is no miptail in linear and + // 256b_2d addressing since they are both 256b block. 
+ BOOL_32 SupportsMipTail(Addr3SwizzleMode swizzleMode) const + { + return GetBlockSize(swizzleMode) > 256u; + } + + UINT_32 ComputeOffsetFromEquation( + const ADDR_EQUATION* pEq, + UINT_32 x, + UINT_32 y, + UINT_32 z, + UINT_32 s) const; + + const ADDR_SW_PATINFO* GetSwizzlePatternInfo( + Addr3SwizzleMode swizzleMode, + UINT_32 log2Elem, + UINT_32 numFrag) const; + + VOID GetMipOffset( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + VOID GetMipOrigin( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + const ADDR_EXTENT3D& mipExtentFirstInTail, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; +}; + +} // V3 +} // Addr +} // namespace rocr +#endif diff --git a/src/image/addrlib/src/gfx9/gfx9addrlib.cpp b/src/image/addrlib/src/gfx9/gfx9addrlib.cpp index 3b62d2d78..d98fd8058 100644 --- a/src/image/addrlib/src/gfx9/gfx9addrlib.cpp +++ b/src/image/addrlib/src/gfx9/gfx9addrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,9 +21,9 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// - namespace rocr { -namespace Addr { +namespace Addr +{ /** ************************************************************************************************************************ @@ -363,6 +346,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeCmaskInfo( // Generate the CMASK address equation. pOut->equation.gfx9.num_bits = Min(32u, eq->getsize()); + bool checked = false; for (unsigned b = 0; b < pOut->equation.gfx9.num_bits; b++) { CoordTerm &bit = (*eq)[b]; @@ -727,6 +711,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeDccInfo( // Generate the DCC address equation. 
pOut->equation.gfx9.num_bits = Min(32u, eq->getsize()); + bool checked = false; for (unsigned b = 0; b < pOut->equation.gfx9.num_bits; b++) { CoordTerm &bit = (*eq)[b]; @@ -2409,6 +2394,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeBlock256Equation( ADDR_E_RETURNCODE ret = ADDR_OK; pEquation->numBits = 8; + pEquation->numBitComponents = 1; UINT_32 i = 0; for (; i < elementBytesLog2; i++) @@ -2735,6 +2721,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeThinEquation( } } + FillEqBitComponents(pEquation); pEquation->numBits = blockSizeLog2; } @@ -3012,6 +2999,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeThickEquation( } } + FillEqBitComponents(pEquation); pEquation->numBits = blockSizeLog2; } @@ -3722,7 +3710,9 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( // Apply optional restrictions if (pIn->flags.needEquation) { - FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); } if (allowedSwModeSet.value == Gfx9LinearSwModeMask) @@ -3763,6 +3753,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; UINT_64 minSize = 0; @@ -3770,7 +3761,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockLinear; i < AddrBlockMaxTiledType; i++) { - if (IsBlockTypeAvaiable(allowedBlockSet, static_cast<AddrBlockType>(i))) + if (Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<AddrBlockType>(i))) { localIn.swizzleMode = swMode[i]; @@ -3788,7 +3779,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( padSize[i] = localOut.surfSize; if ((minSize == 0) || - BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) { minSize = padSize[i]; minSizeBlk = i; @@ -3829,9 +3820,9 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) { if ((i != minSizeBlk) && - IsBlockTypeAvaiable(allowedBlockSet, static_cast<AddrBlockType>(i))) + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<AddrBlockType>(i))) { - if (BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) { // Clear the block type if the memory waste is unacceptable allowedBlockSet.value &= ~(1u << (i - 1)); @@ -5227,4 +5218,4 @@ VOID Gfx9Lib::ComputeThinBlockDimension( } // V2 } // Addr -} // rocr \ No newline at end of file +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/gfx9/gfx9addrlib.h b/src/image/addrlib/src/gfx9/gfx9addrlib.h index 10ea35139..990a55db2 100644 --- a/src/image/addrlib/src/gfx9/gfx9addrlib.h +++ b/src/image/addrlib/src/gfx9/gfx9addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved.
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "coord.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -647,7 +632,6 @@ class Gfx9Lib : public Lib } // V2 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/r800/ciaddrlib.cpp b/src/image/addrlib/src/r800/ciaddrlib.cpp index 200f71589..81f39a239 100644 --- a/src/image/addrlib/src/r800/ciaddrlib.cpp +++ b/src/image/addrlib/src/r800/ciaddrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro 
Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -39,7 +22,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -2351,4 +2335,4 @@ BOOL_32 CiLib::CheckTcCompatibility( } // V1 } // Addr -} // rocr \ No newline at end of file +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/r800/ciaddrlib.h b/src/image/addrlib/src/r800/ciaddrlib.h index 894ddd321..997f0ba41 100644 --- a/src/image/addrlib/src/r800/ciaddrlib.h +++ 
b/src/image/addrlib/src/r800/ciaddrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "siaddrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** @@ -204,5 +189,7 @@ class CiLib : public SiLib } // V1 } // Addr -} // rocr +} // namespace rocr #endif + + diff --git a/src/image/addrlib/src/r800/egbaddrlib.cpp b/src/image/addrlib/src/r800/egbaddrlib.cpp index c762ab934..ee9a0a9eb 100644 --- a/src/image/addrlib/src/r800/egbaddrlib.cpp +++ b/src/image/addrlib/src/r800/egbaddrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ /** @@ -32,8 +15,10 @@ #include "egbaddrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** @@ -1558,6 +1543,8 @@ ADDR_E_RETURNCODE EgBasedLib::ComputeMacroTileEquation( pEquation->xor2[bankBitStart + i] = equation.xor2[i]; pEquation->numBits++; } + + FillEqBitComponents(pEquation); } } } @@ -3133,6 +3120,7 @@ UINT_32 EgBasedLib::ComputePipeRotation( } + /** **************************************************************************************************** * EgBasedLib::ComputeBankRotation @@ -4156,4 +4144,4 @@ UINT_32 EgBasedLib::HwlStereoCheckRightOffsetPadding( } // V1 } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/r800/egbaddrlib.h b/src/image/addrlib/src/r800/egbaddrlib.h index ebae1ad6d..4a203c7d3 100644 --- a/src/image/addrlib/src/r800/egbaddrlib.h +++ b/src/image/addrlib/src/r800/egbaddrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,8 +19,10 @@ #include "addrlib1.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /// Structures for functions struct CoordFromBankPipe { @@ -423,7 +408,6 @@ class EgBasedLib : public Lib } // V1 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/r800/siaddrlib.cpp b/src/image/addrlib/src/r800/siaddrlib.cpp index cb3760559..4abbed2b9 100644 --- a/src/image/addrlib/src/r800/siaddrlib.cpp +++ b/src/image/addrlib/src/r800/siaddrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,7 +21,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -419,6 +403,7 @@ ADDR_E_RETURNCODE SiLib::ComputeBankEquation( } } } + FillEqBitComponents(pEquation); if ((pTileInfo->bankWidth == 1) && ((pTileInfo->pipeConfig == ADDR_PIPECFG_P4_32x32) || @@ -1661,7 +1646,9 @@ UINT_32 SiLib::HwlGetPitchAlignmentLinear( } else { - pitchAlign = Max(8u, 64 / BITS_TO_BYTES(bpp)); + { + pitchAlign = Max(8u, 64 / BITS_TO_BYTES(bpp)); + } } return 
pitchAlign; @@ -2279,7 +2266,10 @@ BOOL_32 SiLib::DecodeGbRegs( reg.val = pRegValue->gbAddrConfig; - switch (reg.f.pipe_interleave_size) + UINT_32 pipe_interleave_size = reg.f.pipe_interleave_size; + UINT_32 row_size = reg.f.row_size; + + switch (pipe_interleave_size) { case ADDR_CONFIG_PIPE_INTERLEAVE_256B: m_pipeInterleaveBytes = ADDR_PIPEINTERLEAVE_256B; @@ -2293,7 +2283,7 @@ BOOL_32 SiLib::DecodeGbRegs( break; } - switch (reg.f.row_size) + switch (row_size) { case ADDR_CONFIG_1KB_ROW: m_rowSize = ADDR_ROWSIZE_1KB; @@ -3869,4 +3859,4 @@ BOOL_32 SiLib::IsEquationSupported( } // V1 } // Addr -} // rocr +} // namespace rocr diff --git a/src/image/addrlib/src/r800/siaddrlib.h b/src/image/addrlib/src/r800/siaddrlib.h index d5f23d80a..c8de9b904 100644 --- a/src/image/addrlib/src/r800/siaddrlib.h +++ b/src/image/addrlib/src/r800/siaddrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "egbaddrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** @@ -84,8 +69,11 @@ struct SiChipSettings UINT_32 isPolaris10 : 1; UINT_32 isPolaris11 : 1; UINT_32 isPolaris12 : 1; + // VI fusion UINT_32 isVegaM : 1; UINT_32 isCarrizo : 1; + + UINT_32 : 2; }; /** @@ -339,6 +327,6 @@ class SiLib : public EgBasedLib } // V1 } // Addr -} // rocr +} // namespace rocr #endif diff --git a/src/image/blit_kernel.cpp b/src/image/blit_kernel.cpp index afbafed66..5b6d9cfae 100644 --- a/src/image/blit_kernel.cpp +++ b/src/image/blit_kernel.cpp @@ -105,6 +105,8 @@ extern uint8_t ocl_blit_object_gfx1102[]; extern uint8_t ocl_blit_object_gfx1103[]; extern uint8_t ocl_blit_object_gfx1150[]; extern uint8_t ocl_blit_object_gfx1151[]; +extern uint8_t ocl_blit_object_gfx1200[]; +extern uint8_t ocl_blit_object_gfx1201[]; // Arguments inserted by OCL compiler, all zero here. 
struct OCLHiddenArgs { @@ -1052,6 +1054,10 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name, *blit_code_object = ocl_blit_object_gfx1150; } else if (sname == "gfx1151") { *blit_code_object = ocl_blit_object_gfx1151; + } else if (sname == "gfx1200") { + *blit_code_object = ocl_blit_object_gfx1200; + } else if (sname == "gfx1201") { + *blit_code_object = ocl_blit_object_gfx1201; } else { return HSA_STATUS_ERROR_INVALID_ISA_NAME; } diff --git a/src/image/blit_src/CMakeLists.txt b/src/image/blit_src/CMakeLists.txt index 481adf81a..94ba26267 100644 --- a/src/image/blit_src/CMakeLists.txt +++ b/src/image/blit_src/CMakeLists.txt @@ -50,7 +50,7 @@ if (NOT DEFINED TARGET_DEVICES) set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810" "gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx940;gfx941;gfx942" "gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" - "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151") + "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1200;gfx1201") endif() set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE ) diff --git a/src/image/image_manager_gfx11.cpp b/src/image/image_manager_gfx11.cpp index 20a9eeaf1..427dab386 100644 --- a/src/image/image_manager_gfx11.cpp +++ b/src/image/image_manager_gfx11.cpp @@ -703,11 +703,6 @@ uint32_t ImageManagerGfx11::GetAddrlibSurfaceInfoNv( prefSettingsInput.forbiddenBlock.macroThick64KB = 1; prefSettingsInput.forbiddenBlock.micro = 1; prefSettingsInput.forbiddenBlock.var = 1; - } else { - // Debug setting, simplifies buffer alignment until language runtimes have official gfx10 - // support. 
- prefSettingsInput.forbiddenBlock.macroThin64KB = 1; - prefSettingsInput.forbiddenBlock.macroThick64KB = 1; } // but don't ever allow the 256b swizzle modes diff --git a/src/image/image_manager_gfx12.cpp b/src/image/image_manager_gfx12.cpp new file mode 100644 index 000000000..14a37b8fe --- /dev/null +++ b/src/image/image_manager_gfx12.cpp @@ -0,0 +1,896 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#define NOMINMAX +#include "image_manager_gfx12.h" + +#include + +#include +#include + +#include "inc/hsa_ext_amd.h" +#include "core/inc/hsa_internal.h" +#include "core/util/utils.h" +#include "addrlib/src/core/addrlib.h" +#include "image_runtime.h" +#include "resource.h" +#include "resource_gfx12.h" +#include "util.h" +#include "device_info.h" + +namespace rocr { +namespace image { + +static_assert(sizeof(SQ_BUF_RSRC_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD3) == sizeof(uint32_t)); + +static_assert(sizeof(SQ_IMG_RSRC_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD3) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD4) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD5) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD6) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD7) == sizeof(uint32_t)); + +static_assert(sizeof(SQ_IMG_SAMP_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD3) == sizeof(uint32_t)); + +//----------------------------------------------------------------------------- +// Workaround switch to combined format/type codes and missing gfx11 +// specific look up table. Only covers types used in image_lut_gfx11.cpp. 
+//----------------------------------------------------------------------------- +struct formatconverstion_t { + FMT fmt; + type type; + FORMAT format; +}; + +// Format/Type to combined format code table. +// Sorted and indexed to allow fast searches. +static const formatconverstion_t FormatLUT[] = { + {FMT_1_5_5_5, TYPE_UNORM, CFMT_1_5_5_5_UNORM}, // 0 + {FMT_10_10_10_2, TYPE_UNORM, CFMT_10_10_10_2_UNORM}, // 1 + {FMT_10_10_10_2, TYPE_SNORM, CFMT_10_10_10_2_SNORM}, // 2 + {FMT_10_10_10_2, TYPE_UINT, CFMT_10_10_10_2_UINT}, // 3 + {FMT_10_10_10_2, TYPE_SINT, CFMT_10_10_10_2_SINT}, // 4 + {FMT_16, TYPE_UNORM, CFMT_16_UNORM}, // 5 + {FMT_16, TYPE_SNORM, CFMT_16_SNORM}, // 6 + {FMT_16, TYPE_UINT, CFMT_16_UINT}, // 7 + {FMT_16, TYPE_SINT, CFMT_16_SINT}, // 8 + {FMT_16, TYPE_FLOAT, CFMT_16_FLOAT}, // 9 + {FMT_16, TYPE_USCALED, CFMT_16_USCALED}, // 10 + {FMT_16, TYPE_SSCALED, CFMT_16_SSCALED}, // 11 + {FMT_16_16, TYPE_UNORM, CFMT_16_16_UNORM}, // 12 + {FMT_16_16, TYPE_SNORM, CFMT_16_16_SNORM}, // 13 + {FMT_16_16, TYPE_UINT, CFMT_16_16_UINT}, // 14 + {FMT_16_16, TYPE_SINT, CFMT_16_16_SINT}, // 15 + {FMT_16_16, TYPE_FLOAT, CFMT_16_16_FLOAT}, // 16 + {FMT_16_16, TYPE_USCALED, CFMT_16_16_USCALED}, // 17 + {FMT_16_16, TYPE_SSCALED, CFMT_16_16_SSCALED}, // 18 + {FMT_16_16_16_16, TYPE_UNORM, CFMT_16_16_16_16_UNORM}, // 19 + {FMT_16_16_16_16, TYPE_SNORM, CFMT_16_16_16_16_SNORM}, // 20 + {FMT_16_16_16_16, TYPE_UINT, CFMT_16_16_16_16_UINT}, // 21 + {FMT_16_16_16_16, TYPE_SINT, CFMT_16_16_16_16_SINT}, // 22 + {FMT_16_16_16_16, TYPE_FLOAT, CFMT_16_16_16_16_FLOAT}, // 23 + {FMT_16_16_16_16, TYPE_USCALED, CFMT_16_16_16_16_USCALED}, // 24 + {FMT_16_16_16_16, TYPE_SSCALED, CFMT_16_16_16_16_SSCALED}, // 25 + {FMT_2_10_10_10, TYPE_UNORM, CFMT_2_10_10_10_UNORM}, // 26 + {FMT_2_10_10_10, TYPE_SNORM, CFMT_2_10_10_10_SNORM}, // 27 + {FMT_2_10_10_10, TYPE_UINT, CFMT_2_10_10_10_UINT}, // 28 + {FMT_2_10_10_10, TYPE_SINT, CFMT_2_10_10_10_SINT}, // 29 + {FMT_2_10_10_10, TYPE_USCALED, 
CFMT_2_10_10_10_USCALED}, // 30 + {FMT_2_10_10_10, TYPE_SSCALED, CFMT_2_10_10_10_SSCALED}, // 31 + {FMT_24_8, TYPE_UNORM, CFMT_24_8_UNORM}, // 32 + {FMT_24_8, TYPE_UINT, CFMT_24_8_UINT}, // 33 + {FMT_32, TYPE_UINT, CFMT_32_UINT}, // 34 + {FMT_32, TYPE_SINT, CFMT_32_SINT}, // 35 + {FMT_32, TYPE_FLOAT, CFMT_32_FLOAT}, // 36 + {FMT_32_32, TYPE_UINT, CFMT_32_32_UINT}, // 37 + {FMT_32_32, TYPE_SINT, CFMT_32_32_SINT}, // 38 + {FMT_32_32, TYPE_FLOAT, CFMT_32_32_FLOAT}, // 39 + {FMT_32_32_32, TYPE_UINT, CFMT_32_32_32_UINT}, // 40 + {FMT_32_32_32, TYPE_SINT, CFMT_32_32_32_SINT}, // 41 + {FMT_32_32_32, TYPE_FLOAT, CFMT_32_32_32_FLOAT}, // 42 + {FMT_32_32_32_32, TYPE_UINT, CFMT_32_32_32_32_UINT}, // 43 + {FMT_32_32_32_32, TYPE_SINT, CFMT_32_32_32_32_SINT}, // 44 + {FMT_32_32_32_32, TYPE_FLOAT, CFMT_32_32_32_32_FLOAT}, // 45 + {FMT_5_5_5_1, TYPE_UNORM, CFMT_5_5_5_1_UNORM}, // 46 + {FMT_5_6_5, TYPE_UNORM, CFMT_5_6_5_UNORM}, // 47 + {FMT_8, TYPE_UNORM, CFMT_8_UNORM}, // 48 + {FMT_8, TYPE_SNORM, CFMT_8_SNORM}, // 49 + {FMT_8, TYPE_UINT, CFMT_8_UINT}, // 50 + {FMT_8, TYPE_SINT, CFMT_8_SINT}, // 51 + {FMT_8, TYPE_SRGB, CFMT_8_SRGB}, // 52 + {FMT_8, TYPE_USCALED, CFMT_8_USCALED}, // 53 + {FMT_8, TYPE_SSCALED, CFMT_8_SSCALED}, // 54 + {FMT_8_24, TYPE_UNORM, CFMT_8_24_UNORM}, // 55 + {FMT_8_24, TYPE_UINT, CFMT_8_24_UINT}, // 56 + {FMT_8_8, TYPE_UNORM, CFMT_8_8_UNORM}, // 57 + {FMT_8_8, TYPE_SNORM, CFMT_8_8_SNORM}, // 58 + {FMT_8_8, TYPE_UINT, CFMT_8_8_UINT}, // 59 + {FMT_8_8, TYPE_SINT, CFMT_8_8_SINT}, // 60 + {FMT_8_8, TYPE_SRGB, CFMT_8_8_SRGB}, // 61 + {FMT_8_8, TYPE_USCALED, CFMT_8_8_USCALED}, // 62 + {FMT_8_8, TYPE_SSCALED, CFMT_8_8_SSCALED}, // 63 + {FMT_8_8_8_8, TYPE_UNORM, CFMT_8_8_8_8_UNORM}, // 64 + {FMT_8_8_8_8, TYPE_SNORM, CFMT_8_8_8_8_SNORM}, // 65 + {FMT_8_8_8_8, TYPE_UINT, CFMT_8_8_8_8_UINT}, // 66 + {FMT_8_8_8_8, TYPE_SINT, CFMT_8_8_8_8_SINT}, // 67 + {FMT_8_8_8_8, TYPE_SRGB, CFMT_8_8_8_8_SRGB}, // 68 + {FMT_8_8_8_8, TYPE_USCALED, CFMT_8_8_8_8_USCALED}, // 69 + 
{FMT_8_8_8_8, TYPE_SSCALED, CFMT_8_8_8_8_SSCALED} // 70 +}; +static const int FormatLUTSize = sizeof(FormatLUT)/sizeof(formatconverstion_t); + +//Index in FormatLUT to start search, indexed by FMT enum. +static const int FormatEntryPoint[] = { + 71, // FMT_INVALID + 48, // FMT_8 + 5, // FMT_16 + 57, // FMT_8_8 + 34, // FMT_32 + 12, // FMT_16_16 + 71, // FMT_10_11_11 + 71, // FMT_11_11_10 + 1, // FMT_10_10_10_2 + 26, // FMT_2_10_10_10 + 64, // FMT_8_8_8_8 + 37, // FMT_32_32 + 19, // FMT_16_16_16_16 + 40, // FMT_32_32_32 + 43, // FMT_32_32_32_32 + 71, // RESERVED + 47, // FMT_5_6_5 + 0, // FMT_1_5_5_5 + 46, // FMT_5_5_5_1 + 71, // FMT_4_4_4_4 + 55, // FMT_8_24 + 32 // FMT_24_8 +}; + +static FORMAT GetCombinedFormat(uint8_t fmt, uint8_t type) { + assert(fmt < sizeof(FormatEntryPoint)/sizeof(int) && "FMT out of range."); + int start = FormatEntryPoint[fmt]; + int stop = std::min(start + 6, FormatLUTSize); // Only 6 types are used in image_kv_lut.cpp + + for(int i=start; i> 3) * out.pitch; + size_t slicePitch = rowPitch * out.height; + if (desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB && + image_data_layout == HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR && + ((image_data_row_pitch && (rowPitch != image_data_row_pitch)) || + (image_data_slice_pitch && (slicePitch != image_data_slice_pitch)))) { + return static_cast( + HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED); + } + + image_info.size = out.surfSize; + assert(image_info.size != 0); + image_info.alignment = out.baseAlign; + assert(image_info.alignment != 0); + + return HSA_STATUS_SUCCESS; +} + +bool ImageManagerGfx12::IsLocalMemory(const void* address) const { + return true; +} + +hsa_status_t ImageManagerGfx12::PopulateImageSrd(Image& image, + const metadata_amd_t* descriptor) const { + const metadata_amd_gfx12_t* desc = reinterpret_cast(descriptor); + const void* image_data_addr = image.data; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + if ((image_prop.cap == 
HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED) || + (image_prop.element_size == 0)) + return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + + if (IsLocalMemory(image.data)) { + image_data_addr = reinterpret_cast( + reinterpret_cast(image.data) - local_memory_base_address_); + } + + image.srd[0] = desc->word0.u32All; + image.srd[1] = desc->word1.u32All; + image.srd[2] = desc->word2.u32All; + image.srd[3] = desc->word3.u32All; + image.srd[4] = desc->word4.u32All; + image.srd[5] = desc->word5.u32All; + image.srd[6] = desc->word6.u32All; + image.srd[7] = desc->word7.u32All; + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + SQ_BUF_RSRC_WORD0 word0; + SQ_BUF_RSRC_WORD1 word1; + SQ_BUF_RSRC_WORD3 word3; + + word0.val = 0; + word0.f.BASE_ADDRESS = PtrLow32(image_data_addr); + + word1.val = image.srd[1]; + word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr); + word1.f.STRIDE = image_prop.element_size; + + word3.val = image.srd[3]; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + + word3.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + word3.f.INDEX_STRIDE = image_prop.element_size; + + // New to GFX12 + //word3.f.WRITE_COMPRESS_ENABLE = 0; + //word3.f.COMPRESSION_EN = 0; + //word3.f.COMPRESSION_ACCESS_MODE = 0; + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[3] = word3.val; + } else { + uint32_t hwPixelSize = ImageLut().GetPixelSize(image_prop.data_format, image_prop.data_type); + + if (image_prop.element_size != hwPixelSize) { + return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED; + } + reinterpret_cast(&image.srd[0])->bits.BASE_ADDRESS = + PtrLow40Shift8(image_data_addr); + reinterpret_cast(&image.srd[1])->bits.BASE_ADDRESS_HI = + PtrHigh64Shift40(image_data_addr); + + // New to GFX12... 
+ //reinterpret_cast(&image.srd[1])->bits.MAX_MIP = 0; + + reinterpret_cast(&image.srd[1])->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + reinterpret_cast(&image.srd[3])->bits.DST_SEL_X = + swizzle.x; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_Y = + swizzle.y; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_Z = + swizzle.z; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_W = + swizzle.w; + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1D) { + reinterpret_cast(&image.srd[3])->bits.TYPE = + ImageLut().MapGeometry(image.desc.geometry); + } + } + + // Looks like this is only used for CPU copies. + image.row_pitch = 0; + image.slice_pitch = 0; + + // Used by HSAIL shader ABI + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +static TEX_BC_SWIZZLE GetBcSwizzle(const Swizzle& swizzle) { + SEL r = (SEL)swizzle.x; + SEL g = (SEL)swizzle.y; + SEL b = (SEL)swizzle.z; + SEL a = (SEL)swizzle.w; + + TEX_BC_SWIZZLE bcSwizzle = TEX_BC_Swizzle_XYZW; + + if (a == SEL_X) { + // Have to use either TEX_BC_Swizzle_WZYX or TEX_BC_Swizzle_WXYZ + // + // For the pre-defined border color values (white, opaque black, + // transparent black), the only thing that matters is that the alpha + // channel winds up in the correct place (because the RGB channels are + // all the same) so either of these TEX_BC_Swizzle enumerations will + // work. Not sure what happens with border color palettes. 
+ if (b == SEL_Y) { + // ABGR + bcSwizzle = TEX_BC_Swizzle_WZYX; + } else if ((r == SEL_X) && (g == SEL_X) && (b == SEL_X)) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else { + // ARGB + bcSwizzle = TEX_BC_Swizzle_WXYZ; + } + } else if (r == SEL_X) { + // Have to use either TEX_BC_Swizzle_XYZW or TEX_BC_Swizzle_XWYZ + if (g == SEL_Y) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else if ((g == SEL_X) && (b == SEL_X) && (a == SEL_W)) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else { + // RAGB + bcSwizzle = TEX_BC_Swizzle_XWYZ; + } + } else if (g == SEL_X) { + // GRAB, have to use TEX_BC_Swizzle_YXWZ + bcSwizzle = TEX_BC_Swizzle_YXWZ; + } else if (b == SEL_X) { + // BGRA, have to use TEX_BC_Swizzle_ZYXW + bcSwizzle = TEX_BC_Swizzle_ZYXW; + } + + return bcSwizzle; +} + + +hsa_status_t ImageManagerGfx12::PopulateImageSrd(Image& image) const { + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED); + assert(image_prop.element_size != 0); + + const void* image_data_addr = image.data; + + if (IsLocalMemory(image.data)) + image_data_addr = reinterpret_cast( + reinterpret_cast(image.data) - local_memory_base_address_); + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + SQ_BUF_RSRC_WORD0 word0; + SQ_BUF_RSRC_WORD1 word1; + SQ_BUF_RSRC_WORD2 word2; + SQ_BUF_RSRC_WORD3 word3; + + word0.val = 0; + word0.f.BASE_ADDRESS = PtrLow32(image_data_addr); + + word1.val = 0; + word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr); + word1.f.STRIDE = image_prop.element_size; + + word1.f.SWIZZLE_ENABLE = 0; + + word2.f.NUM_RECORDS = image.desc.width * image_prop.element_size; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + word3.val = 0; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + word3.f.FORMAT = 
GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + word3.f.INDEX_STRIDE = image_prop.element_size; + + // New to GFX12 + //word3.f.WRITE_COMPRESS_ENABLE = 0; + //word3.f.COMPRESSION_EN = 0; + //word3.f.COMPRESSION_ACCESS_MODE = 0; + + word3.f.TYPE = ImageLut().MapGeometry(image.desc.geometry); + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[2] = word2.val; + image.srd[3] = word3.val; + + image.row_pitch = image.desc.width * image_prop.element_size; + image.slice_pitch = image.row_pitch; + } else { + SQ_IMG_RSRC_WORD0 word0; + SQ_IMG_RSRC_WORD1 word1; + SQ_IMG_RSRC_WORD2 word2; + SQ_IMG_RSRC_WORD3 word3; + SQ_IMG_RSRC_WORD4 word4; + SQ_IMG_RSRC_WORD5 word5; + SQ_IMG_RSRC_WORD5 word6; + SQ_IMG_RSRC_WORD5 word7; + + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT out = {0}; + + uint32_t swizzleMode = GetAddrlibSurfaceInfoNv( + image.component, image.desc, image.tile_mode, + image.row_pitch, image.slice_pitch, out); + if (swizzleMode == (uint32_t)(-1)) { + return HSA_STATUS_ERROR; + } + + assert((out.bpp / 8) == image_prop.element_size); + + const size_t row_pitch_size = out.pitch * image_prop.element_size; + + word0.f.BASE_ADDRESS = PtrLow40Shift8(image_data_addr); + + word1.val = 0; + word1.f.BASE_ADDRESS_HI = PtrHigh64Shift40(image_data_addr); + + // New to GFX12 + //word1.f.MAX_MIP = 0; + //word1.f.BASE_LEVEL = 0; + + word1.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + // Only take the lowest 2 bits of (image.desc.width - 1) + word1.f.WIDTH = BitSelect<0, 1>(image.desc.width - 1); + + word2.val = 0; + // Take the high 14 bits of (image.desc.width - 1) + word2.f.WIDTH_HI = BitSelect<2, 15>(image.desc.width - 1); + word2.f.HEIGHT = image.desc.height ? 
image.desc.height - 1 : 0; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + word3.val = 0; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + //word3.f.NO_EDGE_CLAMP = 0; // New to GFX12 + //word3.f.LAST_LEVEL = 0; // New to GFX12 + word3.f.SW_MODE = swizzleMode; + word3.f.BC_SWIZZLE = GetBcSwizzle(swizzle); + word3.f.TYPE = ImageLut().MapGeometry(image.desc.geometry); + + const bool image_array = + (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DADEPTH); + const bool image_3d = (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_3D); + + word4.val = 0; + word4.f.DEPTH = + (image_array) // Doesn't hurt but isn't array_size already >0? + ? std::max(image.desc.array_size, static_cast(1)) - 1 + : (image_3d) ? image.desc.depth - 1 : 0; + + // For 1d, 2d and 2d-msaa this is pitch-1 + if (!image_array && !image_3d) { + uint32_t encPitch = out.pitch - 1; + word4.f.DEPTH = encPitch & 0x1fff; // 13 bits + word4.f.PITCH_MSB = (encPitch >> 13) & 0x3; // last 2 bits + } + + word5.val = 0; + word6.val = 0; + word7.val = 0; + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[2] = word2.val; + image.srd[3] = word3.val; + image.srd[4] = word4.val; + image.srd[5] = word5.val; + image.srd[6] = word6.val; + image.srd[7] = word7.val; + + image.row_pitch = row_pitch_size; + image.slice_pitch = out.sliceSize; + } + + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ImageManagerGfx12::ModifyImageSrd( + Image& image, hsa_ext_image_format_t& new_format) const { + image.desc.format = new_format; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + 
assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED); + assert(image_prop.element_size != 0); + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + SQ_BUF_RSRC_WORD3* word3 = + reinterpret_cast(&image.srd[3]); + word3->bits.DST_SEL_X = swizzle.x; + word3->bits.DST_SEL_Y = swizzle.y; + word3->bits.DST_SEL_Z = swizzle.z; + word3->bits.DST_SEL_W = swizzle.w; + word3->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + } else { + SQ_IMG_RSRC_WORD1* word1 = + reinterpret_cast(&image.srd[1]); + word1->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + SQ_IMG_RSRC_WORD3* word3 = + reinterpret_cast(&image.srd[3]); + word3->bits.DST_SEL_X = swizzle.x; + word3->bits.DST_SEL_Y = swizzle.y; + word3->bits.DST_SEL_Z = swizzle.z; + word3->bits.DST_SEL_W = swizzle.w; + } + + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ImageManagerGfx12::PopulateSamplerSrd(Sampler& sampler) const { + const hsa_ext_sampler_descriptor_t sampler_descriptor = sampler.desc; + + SQ_IMG_SAMP_WORD0 word0; + SQ_IMG_SAMP_WORD1 word1; + SQ_IMG_SAMP_WORD2 word2; + SQ_IMG_SAMP_WORD3 word3; + + word0.u32All = 0; + switch (sampler_descriptor.address_mode) { + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + word0.bits.CLAMP_X = static_cast(SQ_TEX_CLAMP_LAST_TEXEL); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: + word0.bits.CLAMP_X = static_cast(SQ_TEX_CLAMP_BORDER); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + word0.bits.CLAMP_X = static_cast(SQ_TEX_MIRROR); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: + case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: + 
word0.bits.CLAMP_X = static_cast(SQ_TEX_WRAP); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + word0.bits.CLAMP_Y = word0.bits.CLAMP_X; + word0.bits.CLAMP_Z = word0.bits.CLAMP_X; + word0.bits.FORCE_UNNORMALIZED = (sampler_descriptor.coordinate_mode == + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED); + + word1.u32All = 0; + word1.bits.MAX_LOD = 4095; + + word2.u32All = 0; + switch (sampler_descriptor.filter_mode) { + case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: + word2.bits.XY_MAG_FILTER = static_cast(SQ_TEX_XY_FILTER_POINT); + break; + case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: + word2.bits.XY_MAG_FILTER = static_cast(SQ_TEX_XY_FILTER_BILINEAR); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + word2.bits.XY_MIN_FILTER = word2.bits.XY_MAG_FILTER; + word2.bits.Z_FILTER = SQ_TEX_Z_FILTER_NONE; + word2.bits.MIP_FILTER = SQ_TEX_MIP_FILTER_NONE; + + word3.u32All = 0; + + // TODO: check this bit with HSAIL spec. + word3.bits.BORDER_COLOR_TYPE = SQ_TEX_BORDER_COLOR_TRANS_BLACK; + + sampler.srd[0] = word0.u32All; + sampler.srd[1] = word1.u32All; + sampler.srd[2] = word2.u32All; + sampler.srd[3] = word3.u32All; + + return HSA_STATUS_SUCCESS; +} + +uint32_t ImageManagerGfx12::GetAddrlibSurfaceInfoNv( + hsa_agent_t component, const hsa_ext_image_descriptor_t& desc, + Image::TileMode tileMode, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT& out) const { + const ImageProperty image_prop = + GetImageProperty(component, desc.format, desc.geometry); + + const AddrFormat addrlib_format = GetAddrlibFormat(image_prop); + + const uint32_t width = static_cast(desc.width); + const uint32_t height = static_cast(desc.height); + static const size_t kMinNumSlice = 1; + const uint32_t num_slice = static_cast( + std::max(kMinNumSlice, std::max(desc.array_size, desc.depth))); + + ADDR3_COMPUTE_SURFACE_INFO_INPUT in = {0}; + in.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_INPUT); + in.format = addrlib_format; 
+ in.bpp = static_cast(image_prop.element_size) * 8; + in.width = width; + in.height = height; + in.numSlices = num_slice; + in.pitchInElement = image_data_row_pitch / image_prop.element_size; + + switch (desc.geometry) { + case HSA_EXT_IMAGE_GEOMETRY_1D: + case HSA_EXT_IMAGE_GEOMETRY_1DB: + case HSA_EXT_IMAGE_GEOMETRY_1DA: + in.resourceType = ADDR_RSRC_TEX_1D; + break; + + case HSA_EXT_IMAGE_GEOMETRY_2D: + case HSA_EXT_IMAGE_GEOMETRY_2DDEPTH: + case HSA_EXT_IMAGE_GEOMETRY_2DA: + case HSA_EXT_IMAGE_GEOMETRY_2DADEPTH: + in.resourceType = ADDR_RSRC_TEX_2D; + break; + + case HSA_EXT_IMAGE_GEOMETRY_3D: + in.resourceType = ADDR_RSRC_TEX_3D; + break; + } + in.flags.texture = 1; + + if (tileMode == Image::TileMode::LINEAR) + { + in.swizzleMode = ADDR3_LINEAR; + } else { + + /* + * AddrLib3 does not provide the best swizzle mode (unlike AddrLib2). + * Instead, client has to request the list of possible swizzle mode and + * then pick the best one for its needs (i.e. performance/space tradeoffs). + * + */ + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT swOut = { 0 }; + swOut.size = sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT); + + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT swIn = { 0 }; + swIn.size = sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT); + swIn.flags = in.flags; + swIn.resourceType = in.resourceType; + swIn.bpp = in.bpp; + swIn.width = in.width; + swIn.height = in.height; + swIn.numSlices = in.numSlices; + swIn.numMipLevels = in.numMipLevels; + swIn.numSamples = in.numSamples; + /* + * Cannot leave it to 0 like GFX11 Addr2GetPreferredSurfaceSetting method + * as it triggers an ASSERT in AddrLib3 code. + * + * Setting it to 256K to allow for maximum number of swizzle mode in set + * returned (similar behaviour as GFX11). 
+ * + */ + swIn.maxAlign = 256 * 1024; + + + if (ADDR_OK != Addr3GetPossibleSwizzleModes(addr_lib_, &swIn, &swOut)) { + debug_print("Addr3GetPossibleSwizzleModes failed!\n"); + return (uint32_t) -1; + } + + /* + * Remove any modes that the client does not want (if any). + */ + //swOut.validModes.sw***** = 0; + + + /* + * Pick the "best" swizzle mode. + * + * This algorithm is based on behaviour in GFX11 AddrLib and on + * GFX12 code in PAL (that is also based on the GFX11 behaviour). + * + * Ratio variables control the extra space that can be used to get a larger + * swizzle mode. + * + * ratioLow:ratioHi meanings: + * + * 2:1 ratio - same behaviour as GFX11. + * 3:2 ratio - would be equivalent if flag opt4space in GFX11 (not used in ROCr) + * 1:1 ratio - minimum size, not necessary best for performance + * + */ + const UINT_32 ratioLow = 2; + const UINT_32 ratioHigh = 1; + + // Same behaviour as GFX11, remove linear if height is 1. + if (in.height > 1) { + swOut.validModes.swLinear = 0; + } + + UINT_64 minSize = 0; + Addr3SwizzleMode bestSwizzle = ADDR3_MAX_TYPE; + + for (uint32_t i = ADDR3_LINEAR; i < ADDR3_MAX_TYPE; i++) { + + if (swOut.validModes.value & (1 << i)) { + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT localOut = {0}; + localOut.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT); + + in.swizzleMode = (Addr3SwizzleMode) i; + + if (ADDR_OK != Addr3ComputeSurfaceInfo(addr_lib_, &in, &localOut)) { + // Should not happen, if it does, ignore this swizzle mode. 
+ debug_print("Addr3ComputeSurfaceInfo failed!\n"); + continue; + } + + UINT_64 surfaceSize = localOut.surfSize; + + if (bestSwizzle == ADDR3_MAX_TYPE) { + minSize = surfaceSize; + bestSwizzle = (Addr3SwizzleMode) i; + } else if ((surfaceSize * ratioHigh) <= (minSize * ratioLow)) { + bestSwizzle = (Addr3SwizzleMode) i; + } + } + } + + if (bestSwizzle < ADDR3_MAX_TYPE) { + in.swizzleMode = (Addr3SwizzleMode) bestSwizzle; + } else { + debug_print("Unable to find a valid swizzleMode for the surface!\n"); + return (uint32_t) -1; + } + } + + + out.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT); + + if (ADDR_OK != Addr3ComputeSurfaceInfo(addr_lib_, &in, &out)) { + return (uint32_t)(-1); + } + if (out.surfSize == 0) { + return (uint32_t)(-1); + } + + return in.swizzleMode; +} + +hsa_status_t ImageManagerGfx12::FillImage(const Image& image, const void* pattern, + const hsa_ext_image_region_t& region) { + if (BlitQueueInit().queue_ == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + Image* image_view = const_cast(&image); + + SQ_BUF_RSRC_WORD3* word3_buff = NULL; + SQ_IMG_RSRC_WORD3* word3_image = NULL; + uint32_t dst_sel_w_original = 0; + if (image_view->desc.format.channel_type == + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010) { + // Force GPU to ignore the last two bits (alpha bits). 
+ if (image_view->desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + word3_buff = reinterpret_cast(&image_view->srd[3]); + dst_sel_w_original = word3_buff->bits.DST_SEL_W; + word3_buff->bits.DST_SEL_W = SEL_0; + } else { + word3_image = reinterpret_cast(&image_view->srd[3]); + dst_sel_w_original = word3_image->bits.DST_SEL_W; + word3_image->bits.DST_SEL_W = SEL_0; + } + } + + SQ_IMG_RSRC_WORD1* word1 = NULL; + uint32_t num_format_original = 0; + const void* new_pattern = pattern; + float fill_value[4] = {0}; + switch (image_view->desc.format.channel_order) { + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA: { + // We do not have write support for SRGBA image, so convert pattern + // to standard form and treat the image as RGBA image. + const float* pattern_f = reinterpret_cast(pattern); + fill_value[0] = LinearToStandardRGB(pattern_f[0]); + fill_value[1] = LinearToStandardRGB(pattern_f[1]); + fill_value[2] = LinearToStandardRGB(pattern_f[2]); + fill_value[3] = pattern_f[3]; + new_pattern = fill_value; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + + word1 = reinterpret_cast(&image_view->srd[1]); + num_format_original = word1->bits.FORMAT; + word1->bits.FORMAT = GetCombinedFormat(image_prop.data_format, TYPE_UNORM); + } break; + default: + break; + } + + hsa_status_t status = ImageRuntime::instance()->blit_kernel().FillImage( + blit_queue_, blit_code_catalog_, *image_view, new_pattern, region); + + // Revert back original configuration. 
+ if (word3_buff != NULL) { + word3_buff->bits.DST_SEL_W = dst_sel_w_original; + } + + if (word3_image != NULL) { + word3_image->bits.DST_SEL_W = dst_sel_w_original; + } + + if (word1 != NULL) { + word1->bits.FORMAT = num_format_original; + } + + return status; +} + +} // namespace image +} // namespace rocr diff --git a/src/image/image_manager_gfx12.h b/src/image/image_manager_gfx12.h new file mode 100755 index 000000000..085dee9c9 --- /dev/null +++ b/src/image/image_manager_gfx12.h @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ +#define EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ + +#include "addrlib/inc/addrinterface.h" +#include "image_lut_gfx11.h" +#include "image_manager_kv.h" + +namespace rocr { +namespace image { + +class ImageManagerGfx12 : public ImageManagerKv { + public: + ImageManagerGfx12(); + virtual ~ImageManagerGfx12(); + + /// @brief Calculate the size and alignment of the backing storage of an + /// image. + virtual hsa_status_t CalculateImageSizeAndAlignment( + hsa_agent_t component, const hsa_ext_image_descriptor_t& desc, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, size_t image_data_slice_pitch, + hsa_ext_image_data_info_t& image_info) const; + + /// @brief Fill image structure with device specific image object. + virtual hsa_status_t PopulateImageSrd(Image& image) const; + + /// @brief Fill image structure with device specific image object using the given format. + virtual hsa_status_t PopulateImageSrd(Image& image, const metadata_amd_t* desc) const; + + /// @brief Modify device specific image object according to the specified + /// new format. + virtual hsa_status_t ModifyImageSrd(Image& image, + hsa_ext_image_format_t& new_format) const; + + /// @brief Fill sampler structure with device specific sampler object. 
+ virtual hsa_status_t PopulateSamplerSrd(Sampler& sampler) const; + + /// @brief Fill image backing storage using agent copy. + virtual hsa_status_t FillImage(const Image& image, const void* pattern, + const hsa_ext_image_region_t& region); + protected: + uint32_t GetAddrlibSurfaceInfoNv(hsa_agent_t component, + const hsa_ext_image_descriptor_t& desc, + Image::TileMode tileMode, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT& out) const; + + bool IsLocalMemory(const void* address) const; + virtual const ImageLutGfx11& ImageLut() const { return image_lut_gfx11; }; + + private: + ImageLutGfx11 image_lut_gfx11; + DISALLOW_COPY_AND_ASSIGN(ImageManagerGfx12); +}; + +} // namespace image +} // namespace rocr +#endif // EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ diff --git a/src/image/image_manager_nv.cpp b/src/image/image_manager_nv.cpp index 139a3755c..cb897f196 100755 --- a/src/image/image_manager_nv.cpp +++ b/src/image/image_manager_nv.cpp @@ -698,11 +698,6 @@ uint32_t ImageManagerNv::GetAddrlibSurfaceInfoNv( prefSettingsInput.forbiddenBlock.macroThick64KB = 1; prefSettingsInput.forbiddenBlock.micro = 1; prefSettingsInput.forbiddenBlock.var = 1; - } else { - // Debug setting, simplifies buffer alignment until language runtimes have official gfx10 - // support. 
- prefSettingsInput.forbiddenBlock.macroThin64KB = 1; - prefSettingsInput.forbiddenBlock.macroThick64KB = 1; } // but don't ever allow the 256b swizzle modes diff --git a/src/image/image_runtime.cpp b/src/image/image_runtime.cpp index dc4109cd1..3e015be94 100755 --- a/src/image/image_runtime.cpp +++ b/src/image/image_runtime.cpp @@ -55,6 +55,7 @@ #include "image_manager_ai.h" #include "image_manager_nv.h" #include "image_manager_gfx11.h" +#include "image_manager_gfx12.h" #include "device_info.h" namespace rocr { @@ -110,14 +111,22 @@ hsa_status_t ImageRuntime::CreateImageManager(hsa_agent_t agent, void* data) { ImageManager* image_manager; - if (major_ver >= 11) { + switch (major_ver) { + case 12: + image_manager = new ImageManagerGfx12(); + break; + case 11: image_manager = new ImageManagerGfx11(); - } else if (major_ver >= 10) { + break; + case 10: image_manager = new ImageManagerNv(); - } else if (major_ver >= 9) { + break; + case 9: image_manager = new ImageManagerAi(); - } else { + break; + default: image_manager = new ImageManagerKv(); + break; } hsa_error_code = image_manager->Initialize(agent); diff --git a/src/image/resource_gfx12.h b/src/image/resource_gfx12.h new file mode 100644 index 000000000..6b0bd5648 --- /dev/null +++ b/src/image/resource_gfx12.h @@ -0,0 +1,814 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef EXT_IMAGE_RESOURCE_GFX12_H_ +#define EXT_IMAGE_RESOURCE_GFX12_H_ + +#if defined(LITTLEENDIAN_CPU) +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +namespace rocr { +namespace image { + +/**********************************************************/ +/**********************************************************/ +#define SQ_BUF_RSC_WRD0_REG_SZ 32 +#define SQ_BUF_RSC_WRD0_BASE_ADDRESS_SZ 32 + +struct sq_buf_rsrc_word0_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_BUF_RSC_WRD0_BASE_ADDRESS_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_BUF_RSC_WRD0_BASE_ADDRESS_SZ; +#endif +}; + +union SQ_BUF_RSRC_WORD0 { + sq_buf_rsrc_word0_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD0_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; + +/***********/ + +/* Note: These registers are also defined/used in registers.h + * in SQ_BUF_RSRC_WORD*_GFX12 + */ +#define SQ_BUF_RSC_WRD1_REG_SZ 32 +#define SQ_BUF_RSC_WRD1_BASE_ADDRESS_HI_SZ 16 +#define SQ_BUF_RSC_WRD1_STRIDE_SZ 14 +#define SQ_BUF_RSC_WRD1_SWIZZLE_ENABLE_SZ 2 +struct sq_buf_rsrc_word1_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : SQ_BUF_RSC_WRD1_BASE_ADDRESS_HI_SZ; + unsigned int STRIDE : SQ_BUF_RSC_WRD1_STRIDE_SZ; + unsigned int SWIZZLE_ENABLE : SQ_BUF_RSC_WRD1_SWIZZLE_ENABLE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : SQ_BUF_RSC_WRD1_SWIZZLE_ENABLE_SZ; + unsigned int STRIDE : SQ_BUF_RSC_WRD1_STRIDE_SZ; + unsigned int BASE_ADDRESS_HI : SQ_BUF_RSC_WRD1_BASE_ADDRESS_HI_SZ; +#endif +}; + +union SQ_BUF_RSRC_WORD1 { + sq_buf_rsrc_word1_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD1_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_BUF_RSC_WRD2_REG_SZ 32 +#define SQ_BUF_RSC_WRD2_NUM_RECORDS_SZ 32 +struct sq_buf_rsrc_word2_t { 
+#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_RECORDS : SQ_BUF_RSC_WRD2_NUM_RECORDS_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int NUM_RECORDS : SQ_BUF_RSC_WRD2_NUM_RECORDS_SZ; +#endif +}; +union SQ_BUF_RSRC_WORD2 { + sq_buf_rsrc_word2_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD2_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_BUF_RSC_WRD3_REG_SZ 32 +#define SQ_BUF_RSC_WRD3_DST_SEL_X_SZ 3 +#define SQ_BUF_RSC_WRD3_DST_SEL_Y_SZ 3 +#define SQ_BUF_RSC_WRD3_DST_SEL_Z_SZ 3 +#define SQ_BUF_RSC_WRD3_DST_SEL_W_SZ 3 +#define SQ_BUF_RSC_WRD3_FORMAT_SZ 6 +#define SQ_BUF_RSC_WRD3_INDEX_STRIDE_SZ 2 +#define SQ_BUF_RSC_WRD3_ADD_TID_ENABLE_SZ 1 +#define SQ_BUF_RSC_WRD3_WRITE_COMPRESS_ENABLE_SZ 1 +#define SQ_BUF_RSC_WRD3_COMPRESSION_EN_SZ 1 +#define SQ_BUF_RSC_WRD3_COMPRESSION_ACCESS_MODE_SZ 2 +#define SQ_BUF_RSC_WORD3_OOB_SELECT_SZ 2 +#define SQ_BUF_RSC_WRD3_TYPE_SZ 2 +struct sq_buf_rsrc_word3_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : SQ_BUF_RSC_WRD3_DST_SEL_X_SZ; + unsigned int DST_SEL_Y : SQ_BUF_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_Z : SQ_BUF_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_W : SQ_BUF_RSC_WRD3_DST_SEL_W_SZ; + unsigned int FORMAT : SQ_BUF_RSC_WRD3_FORMAT_SZ; + unsigned int : 3; + unsigned int INDEX_STRIDE : SQ_BUF_RSC_WRD3_INDEX_STRIDE_SZ; + unsigned int ADD_TID_ENABLE : SQ_BUF_RSC_WRD3_ADD_TID_ENABLE_SZ; + unsigned int WRITE_COMPRESS_ENABLE : SQ_BUF_RSC_WRD3_WRITE_COMPRESS_ENABLE_SZ; + unsigned int COMPRESSION_EN : SQ_BUF_RSC_WRD3_COMPRESSION_EN_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_BUF_RSC_WRD3_COMPRESSION_ACCESS_MODE_SZ; + unsigned int OOB_SELECT : SQ_BUF_RSC_WORD3_OOB_SELECT_SZ; + unsigned int TYPE : SQ_BUF_RSC_WRD3_TYPE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : SQ_BUF_RSC_WRD3_TYPE_SZ; + unsigned int OOB_SELECT : SQ_BUF_RSC_WORD3_OOB_SELECT_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_BUF_RSC_WRD3_COMPRESSION_ACCESS_MODE_SZ; + unsigned int 
COMPRESSION_EN : SQ_BUF_RSC_WRD3_COMPRESSION_EN_SZ; + unsigned int WRITE_COMPRESS_ENABLE : SQ_BUF_RSC_WRD3_WRITE_COMPRESS_ENABLE_SZ; + unsigned int ADD_TID_ENABLE : SQ_BUF_RSC_WRD3_ADD_TID_ENABLE_SZ; + unsigned int INDEX_STRIDE : SQ_BUF_RSC_WRD3_INDEX_STRIDE_SZ; + unsigned int : 3; + unsigned int FORMAT : SQ_BUF_RSC_WRD3_FORMAT_SZ; + unsigned int DST_SEL_W : SQ_BUF_RSC_WRD3_DST_SEL_W_SZ; + unsigned int DST_SEL_Z : SQ_BUF_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_Y : SQ_BUF_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_X : SQ_BUF_RSC_WRD3_DST_SEL_X_SZ; +#endif +}; +union SQ_BUF_RSRC_WORD3 { + sq_buf_rsrc_word3_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD3_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +/**********************************************************/ +/**********************************************************/ +#define SQ_IMG_RSC_WRD0_REG_SZ 32 +#define SQ_IMG_RSC_WRD0_BASE_ADDRESS_SZ 32 +struct sq_img_rsrc_word0_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_IMG_RSC_WRD0_BASE_ADDRESS_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_IMG_RSC_WRD0_BASE_ADDRESS_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD0 { + sq_img_rsrc_word0_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD0_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD1_REG_SZ 32 +#define SQ_IMG_RSC_WRD1_BASE_ADDRESS_HI_SZ 8 +#define SQ_IMG_RSC_WRD1_MAX_MIP_SZ 5 +#define SQ_IMG_RSC_WRD1_FORMAT_SZ 8 +#define SQ_IMG_RSC_WRD1_BASE_LEVEL_SZ 5 +#define SQ_IMG_RSC_WRD1_WIDTH_LO 2 + +struct sq_img_rsrc_word1_t{ +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : SQ_IMG_RSC_WRD1_BASE_ADDRESS_HI_SZ; + unsigned int : 4; + unsigned int MAX_MIP : SQ_IMG_RSC_WRD1_MAX_MIP_SZ; + unsigned int FORMAT : SQ_IMG_RSC_WRD1_FORMAT_SZ; + unsigned int BASE_LEVEL : SQ_IMG_RSC_WRD1_BASE_LEVEL_SZ; + unsigned int WIDTH : SQ_IMG_RSC_WRD1_WIDTH_LO; +#elif defined(BIGENDIAN_CPU) 
+ unsigned int WIDTH : SQ_IMG_RSC_WRD1_WIDTH_LO; + unsigned int BASE_LEVEL : SQ_IMG_RSC_WRD1_BASE_LEVEL_SZ; + unsigned int FORMAT : SQ_IMG_RSC_WRD1_FORMAT_SZ; + unsigned int MAX_MIP : SQ_IMG_RSC_WRD1_MAX_MIP_SZ; + unsigned int : 4; + unsigned int BASE_ADDRESS_HI : SQ_IMG_RSC_WRD1_BASE_ADDRESS_HI_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD1 { + sq_img_rsrc_word1_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD1_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD2_REG_SZ 32 +#define SQ_IMG_RSC_WRD2_WIDTH_HI_SZ 14 +#define SQ_IMG_RSC_WRD2_HEIGHT_SZ 16 +struct sq_img_rsrc_word2_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int WIDTH_HI : SQ_IMG_RSC_WRD2_WIDTH_HI_SZ; + unsigned int HEIGHT : SQ_IMG_RSC_WRD2_HEIGHT_SZ; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int HEIGHT : SQ_IMG_RSC_WRD2_HEIGHT_SZ; + unsigned int WIDTH_HI : SQ_IMG_RSC_WRD2_WIDTH_HI_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD2 { + sq_img_rsrc_word2_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD2_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD3_REG_SZ 32 +#define SQ_IMG_RSC_WRD3_DST_SEL_X_SZ 3 +#define SQ_IMG_RSC_WRD3_DST_SEL_Y_SZ 3 +#define SQ_IMG_RSC_WRD3_DST_SEL_Z_SZ 3 +#define SQ_IMG_RSC_WRD3_DST_SEL_W_SZ 3 +#define SQ_IMG_RSC_WRD3_NO_EDGE_CLAMP_SZ 1 +#define SQ_IMG_RSC_WRD3_LAST_LEVEL_SZ 5 +#define SQ_IMG_RSC_WRD3_SW_MODE_SZ 5 +#define SQ_IMG_RSC_WRD3_BC_SWIZZLE_SZ 3 +#define SQ_IMG_RSC_WRD3_TYPE_SZ 4 +struct sq_img_rsrc_word3_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : SQ_IMG_RSC_WRD3_DST_SEL_X_SZ; + unsigned int DST_SEL_Y : SQ_IMG_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_Z : SQ_IMG_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_W : SQ_IMG_RSC_WRD3_DST_SEL_W_SZ; + unsigned int NO_EDGE_CLAMP : SQ_IMG_RSC_WRD3_NO_EDGE_CLAMP_SZ; + unsigned int : 2; + unsigned int LAST_LEVEL : SQ_IMG_RSC_WRD3_LAST_LEVEL_SZ; + unsigned int SW_MODE 
: SQ_IMG_RSC_WRD3_SW_MODE_SZ; + unsigned int BC_SWIZZLE : SQ_IMG_RSC_WRD3_BC_SWIZZLE_SZ; + unsigned int TYPE : SQ_IMG_RSC_WRD3_TYPE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : SQ_IMG_RSC_WRD3_TYPE_SZ; + unsigned int BC_SWIZZLE : SQ_IMG_RSC_WRD3_BC_SWIZZLE_SZ; + unsigned int SW_MODE : SQ_IMG_RSC_WRD3_SW_MODE_SZ; + unsigned int LAST_LEVEL : SQ_IMG_RSC_WRD3_LAST_LEVEL_SZ; + unsigned int : 2; + unsigned int NO_EDGE_CLAMP : SQ_IMG_RSC_WRD3_NO_EDGE_CLAMP_SZ; + unsigned int DST_SEL_W : SQ_IMG_RSC_WRD3_DST_SEL_W_SZ; + unsigned int DST_SEL_Z : SQ_IMG_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_Y : SQ_IMG_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_X : SQ_IMG_RSC_WRD3_DST_SEL_X_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD3 { + sq_img_rsrc_word3_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD3_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD4_REG_SZ 32 +#define SQ_IMG_RSC_WRD4_DEPTH_SZ 14 +#define SQ_IMG_RSC_WRD4_PITCH_MSB_SZ 2 +#define SQ_IMG_RSC_WRD4_BASE_ARR_SZ 13 +#define SQ_IMG_RSC_WRD4_BASE_ARRAY_MSB_SZ 1 + +struct sq_img_rsrc_word4_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; + unsigned int PITCH_MSB : SQ_IMG_RSC_WRD4_PITCH_MSB_SZ; + unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; + unsigned int BASE_ARRAY_MSB : SQ_IMG_RSC_WRD4_BASE_ARRAY_MSB_SZ; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int BASE_ARRAY_MSB : SQ_IMG_RSC_WRD4_BASE_ARRAY_MSB_SZ; + unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; + unsigned int PITCH_MSB : SQ_IMG_RSC_WRD4_PITCH_MSB_SZ; + unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD4 { + sq_img_rsrc_word4_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD4_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD5_REG_SZ 32 +#define SQ_IMG_RSC_WRD5_UAV3D_SZ 1 +#define SQ_IMG_RSC_WRD5_DEPTH_SCALE_SZ 5 
+#define SQ_IMG_RSC_WRD5_HEIGHT_SCALE_SZ 5 +#define SQ_IMG_RSC_WRD5_WIDTH_SCALE_SZ 5 // Combined two consecutive separate fields width[0:2] and width[3:4]. +#define SQ_IMG_RSC_WRD5_PERF_MOD_SZ 3 +#define SQ_IMG_RSC_WRD5_CORNER_SAMPLES_SZ 1 +#define SQ_IMG_RSC_WRD5_LINKED_RESOURCE_SZ 1 +#define SQ_IMG_RSC_WRD5_LOD_HWD_CNT_EN_SZ 1 +#define SQ_IMG_RSC_WRD5_MIN_LOD_LO_SZ 6 // lowest 6 bits of MIN_LOD (13 bit total) + +struct sq_img_rsrc_word5_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int : 4; + unsigned int UAV3D : SQ_IMG_RSC_WRD5_UAV3D_SZ; + unsigned int DEPTH_SCALE : SQ_IMG_RSC_WRD5_DEPTH_SCALE_SZ; + unsigned int HEIGHT_SCALE : SQ_IMG_RSC_WRD5_HEIGHT_SCALE_SZ; + unsigned int WIDTH_SCALE : SQ_IMG_RSC_WRD5_WIDTH_SCALE_SZ; + unsigned int PERF_MOD : SQ_IMG_RSC_WRD5_PERF_MOD_SZ; + unsigned int CORNER_SAMPLES : SQ_IMG_RSC_WRD5_CORNER_SAMPLES_SZ; + unsigned int LINKED_RESOURCE : SQ_IMG_RSC_WRD5_LINKED_RESOURCE_SZ; + unsigned int LOD_HWD_CNT_EN : SQ_IMG_RSC_WRD5_LOD_HWD_CNT_EN_SZ; + unsigned int MIN_LOD_LO : SQ_IMG_RSC_WRD5_MIN_LOD_LO_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int MIN_LOD_LO : SQ_IMG_RSC_WRD5_MIN_LOD_LO_SZ; + unsigned int LOD_HWD_CNT_EN : SQ_IMG_RSC_WRD5_LOD_HWD_CNT_EN_SZ; + unsigned int LINKED_RESOURCE : SQ_IMG_RSC_WRD5_LINKED_RESOURCE_SZ; + unsigned int CORNER_SAMPLES : SQ_IMG_RSC_WRD5_CORNER_SAMPLES_SZ; + unsigned int PERF_MOD : SQ_IMG_RSC_WRD5_PERF_MOD_SZ; + unsigned int WIDTH_SCALE : SQ_IMG_RSC_WRD5_WIDTH_SCALE_SZ; + unsigned int HEIGHT_SCALE : SQ_IMG_RSC_WRD5_HEIGHT_SCALE_SZ; + unsigned int DEPTH_SCALE : SQ_IMG_RSC_WRD5_DEPTH_SCALE_SZ; + unsigned int UAV3D : SQ_IMG_RSC_WRD5_UAV3D_SZ; + unsigned int : 4; +#endif +}; + +union SQ_IMG_RSRC_WORD5 { + sq_img_rsrc_word5_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD5_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD6_REG_SZ 32 + +#define SQ_IMG_RSC_WRD6_MIN_LOD_HI_SZ 7 +#define SQ_IMG_RSC_WRD5_COUNTER_BANK_ID_SZ 8 // 3 fields combined into 
bank_id +#define SQ_IMG_RSC_WRD6_MAX_UNCOMP_BLK_SZ_SZ 1 +#define SQ_IMG_RSC_WRD6_MAX_COMP_BLK_SZ_SZ 2 +#define SQ_IMG_RSC_WRD6_WRITE_COMPRESS_EN_SZ 1 +#define SQ_IMG_RSC_WRD6_COMPRESSION_ENABLE_SZ 1 +#define SQ_IMG_RSC_WRD6_COMPRESSION_ACCESS_MODE_SZ 2 +#define SQ_IMG_RSC_WRD6_SPECULATIVE_READ_SZ 2 + +struct sq_img_rsrc_word6_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int MIN_LOD_HI : SQ_IMG_RSC_WRD6_MIN_LOD_HI_SZ; + unsigned int COUNTER_BANK_ID : SQ_IMG_RSC_WRD5_COUNTER_BANK_ID_SZ; + unsigned int MAX_UNCOMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_UNCOMP_BLK_SZ_SZ; + unsigned int : 1; + unsigned int MAX_COMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_COMP_BLK_SZ_SZ; + unsigned int : 1; + unsigned int WRITE_COMPRESS_ENABLE : SQ_IMG_RSC_WRD6_WRITE_COMPRESS_EN_SZ; + unsigned int COMPRESSION_ENABLE : SQ_IMG_RSC_WRD6_COMPRESSION_ENABLE_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_IMG_RSC_WRD6_COMPRESSION_ACCESS_MODE_SZ; + unsigned int SPECULATIVE_READ : SQ_IMG_RSC_WRD6_SPECULATIVE_READ_SZ; + unsigned int : 6; +#elif defined(BIGENDIAN_CPU) + unsigned int : 6; + unsigned int SPECULATIVE_READ : SQ_IMG_RSC_WRD6_SPECULATIVE_READ_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_IMG_RSC_WRD6_COMPRESSION_ACCESS_MODE_SZ; + unsigned int COMPRESSION_ENABLE : SQ_IMG_RSC_WRD6_COMPRESSION_ENABLE_SZ; + unsigned int WRITE_COMPRESS_ENABLE : SQ_IMG_RSC_WRD6_WRITE_COMPRESS_EN_SZ; + unsigned int : 1; + unsigned int MAX_COMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_COMP_BLK_SZ_SZ; + unsigned int : 1; + unsigned int MAX_UNCOMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_UNCOMP_BLK_SZ_SZ; + unsigned int COUNTER_BANK_ID : SQ_IMG_RSC_WRD5_COUNTER_BANK_ID_SZ; + unsigned int MIN_LOD_HI : SQ_IMG_RSC_WRD6_MIN_LOD_HI_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD6 { + sq_img_rsrc_word6_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD6_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD7_REG_SZ 32 +struct sq_img_rsrc_word7_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int : 32; +#elif 
defined(BIGENDIAN_CPU) + unsigned int : 32; +#endif +}; +union SQ_IMG_RSRC_WORD7 { + sq_img_rsrc_word7_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD7_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ +/**********************************************************/ +/**********************************************************/ + + + + +#define SQ_IMG_SAMP_WORD0_REG_SZ 32 +#define SQ_IMG_SAMP_WORD0_CLAMP_X_SZ 3 +#define SQ_IMG_SAMP_WORD0_CLAMP_Y_SZ 3 +#define SQ_IMG_SAMP_WORD0_CLAMP_Z_SZ 3 +#define SQ_IMG_SAMP_WORD0_MAX_ANISO_RATIO_SZ 3 +#define SQ_IMG_SAMP_WORD0_DEPTH_COMPARE_FUNC_SZ 3 +#define SQ_IMG_SAMP_WORD0_FORCE_UNNORMALIZED_SZ 1 +#define SQ_IMG_SAMP_WORD0_ANISO_THRESHOLD_SZ 3 +#define SQ_IMG_SAMP_WORD0_MC_COORD_TRUNC_SZ 1 +#define SQ_IMG_SAMP_WORD0_FORCE_DEGAMMA_SZ 1 +#define SQ_IMG_SAMP_WORD0_ANISO_BIAS_SZ 6 +#define SQ_IMG_SAMP_WORD0_TRUNC_COORD_SZ 1 +#define SQ_IMG_SAMP_WORD0_DISABLE_CUBE_WRAP_SZ 1 +#define SQ_IMG_SAMP_WORD0_FILTER_MODE_SZ 2 +#define SQ_IMG_SAMP_WORD0_SKIP_DEGAMMA_SZ 1 +struct sq_img_samp_word0_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int CLAMP_X : SQ_IMG_SAMP_WORD0_CLAMP_X_SZ; + unsigned int CLAMP_Y : SQ_IMG_SAMP_WORD0_CLAMP_Y_SZ; + unsigned int CLAMP_Z : SQ_IMG_SAMP_WORD0_CLAMP_Z_SZ; + unsigned int MAX_ANISO_RATIO : SQ_IMG_SAMP_WORD0_MAX_ANISO_RATIO_SZ; + unsigned int DEPTH_COMPARE_FUNC : SQ_IMG_SAMP_WORD0_DEPTH_COMPARE_FUNC_SZ; + unsigned int FORCE_UNNORMALIZED : SQ_IMG_SAMP_WORD0_FORCE_UNNORMALIZED_SZ; + unsigned int ANISO_THRESHOLD : SQ_IMG_SAMP_WORD0_ANISO_THRESHOLD_SZ; + unsigned int MC_COORD_TRUNC : SQ_IMG_SAMP_WORD0_MC_COORD_TRUNC_SZ; + unsigned int FORCE_DEGAMMA : SQ_IMG_SAMP_WORD0_FORCE_DEGAMMA_SZ; + unsigned int ANISO_BIAS : SQ_IMG_SAMP_WORD0_ANISO_BIAS_SZ; + unsigned int TRUNC_COORD : SQ_IMG_SAMP_WORD0_TRUNC_COORD_SZ; + unsigned int DISABLE_CUBE_WRAP : SQ_IMG_SAMP_WORD0_DISABLE_CUBE_WRAP_SZ; + unsigned int FILTER_MODE : SQ_IMG_SAMP_WORD0_FILTER_MODE_SZ; + unsigned int SKIP_DEGAMMA : 
SQ_IMG_SAMP_WORD0_SKIP_DEGAMMA_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int SKIP_DEGAMMA : SQ_IMG_SAMP_WORD0_SKIP_DEGAMMA_SZ; + unsigned int FILTER_MODE : SQ_IMG_SAMP_WORD0_FILTER_MODE_SZ; + unsigned int DISABLE_CUBE_WRAP : SQ_IMG_SAMP_WORD0_DISABLE_CUBE_WRAP_SZ; + unsigned int TRUNC_COORD : SQ_IMG_SAMP_WORD0_TRUNC_COORD_SZ; + unsigned int ANISO_BIAS : SQ_IMG_SAMP_WORD0_ANISO_BIAS_SZ; + unsigned int FORCE_DEGAMMA : SQ_IMG_SAMP_WORD0_FORCE_DEGAMMA_SZ; + unsigned int MC_COORD_TRUNC : SQ_IMG_SAMP_WORD0_MC_COORD_TRUNC_SZ; + unsigned int ANISO_THRESHOLD : SQ_IMG_SAMP_WORD0_ANISO_THRESHOLD_SZ; + unsigned int FORCE_UNNORMALIZED : SQ_IMG_SAMP_WORD0_FORCE_UNNORMALIZED_SZ; + unsigned int DEPTH_COMPARE_FUNC : SQ_IMG_SAMP_WORD0_DEPTH_COMPARE_FUNC_SZ; + unsigned int MAX_ANISO_RATIO : SQ_IMG_SAMP_WORD0_MAX_ANISO_RATIO_SZ; + unsigned int CLAMP_Z : SQ_IMG_SAMP_WORD0_CLAMP_Z_SZ; + unsigned int CLAMP_Y : SQ_IMG_SAMP_WORD0_CLAMP_Y_SZ; + unsigned int CLAMP_X : SQ_IMG_SAMP_WORD0_CLAMP_X_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD0 { + sq_img_samp_word0_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD0_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_SAMP_WORD1_REG_SZ 32 +#define SQ_IMG_SAMP_WORD1_MIN_LOD_SZ 13 +#define SQ_IMG_SAMP_WORD1_MAX_LOD_SZ 13 +#define SQ_IMG_SAMP_WORD1_PERF_Z_SZ 4 +struct sq_img_samp_word1_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int MIN_LOD : SQ_IMG_SAMP_WORD1_MIN_LOD_SZ; + unsigned int MAX_LOD : SQ_IMG_SAMP_WORD1_MAX_LOD_SZ; + unsigned int : 2; + unsigned int PERF_Z : SQ_IMG_SAMP_WORD1_PERF_Z_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int PERF_Z : SQ_IMG_SAMP_WORD1_PERF_Z_SZ; + unsigned int : 2; + unsigned int MAX_LOD : SQ_IMG_SAMP_WORD1_MAX_LOD_SZ; + unsigned int MIN_LOD : SQ_IMG_SAMP_WORD1_MIN_LOD_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD1 { + sq_img_samp_word1_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD1_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; 
+/***********/ + +#define SQ_IMG_SAMP_WORD2_REG_SZ 32 +#define SQ_IMG_SAMP_WORD2_LOD_BIAS_SZ 14 +#define SQ_IMG_SAMP_WORD2_LOD_BIAS_SEC_SZ 6 +#define SQ_IMG_SAMP_WORD2_XY_MAG_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_XY_MIN_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_Z_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_MIP_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_ANISO_OVERRIDE_SZ 1 +#define SQ_IMG_SAMP_WORD2_PERF_MIP_LO_SZ 2 +struct sq_img_samp_word2_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int LOD_BIAS : SQ_IMG_SAMP_WORD2_LOD_BIAS_SZ; + unsigned int LOD_BIAS_SEC : SQ_IMG_SAMP_WORD2_LOD_BIAS_SEC_SZ; + unsigned int XY_MAG_FILTER : SQ_IMG_SAMP_WORD2_XY_MAG_FILTER_SZ; + unsigned int XY_MIN_FILTER : SQ_IMG_SAMP_WORD2_XY_MIN_FILTER_SZ; + unsigned int Z_FILTER : SQ_IMG_SAMP_WORD2_Z_FILTER_SZ; + unsigned int MIP_FILTER : SQ_IMG_SAMP_WORD2_MIP_FILTER_SZ; + unsigned int : 1; + unsigned int ANISO_OVERRIDE : SQ_IMG_SAMP_WORD2_ANISO_OVERRIDE_SZ; + unsigned int PERF_MIP_LO : SQ_IMG_SAMP_WORD2_PERF_MIP_LO_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int PERF_MIP_LO : SQ_IMG_SAMP_WORD2_PERF_MIP_LO_SZ; + unsigned int ANISO_OVERRIDE : SQ_IMG_SAMP_WORD2_ANISO_OVERRIDE_SZ; + unsigned int : 1; + unsigned int MIP_FILTER : SQ_IMG_SAMP_WORD2_MIP_FILTER_SZ; + unsigned int Z_FILTER : SQ_IMG_SAMP_WORD2_Z_FILTER_SZ; + unsigned int XY_MIN_FILTER : SQ_IMG_SAMP_WORD2_XY_MIN_FILTER_SZ; + unsigned int XY_MAG_FILTER : SQ_IMG_SAMP_WORD2_XY_MAG_FILTER_SZ; + unsigned int LOD_BIAS_SEC : SQ_IMG_SAMP_WORD2_LOD_BIAS_SEC_SZ; + unsigned int LOD_BIAS : SQ_IMG_SAMP_WORD2_LOD_BIAS_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD2 { + sq_img_samp_word2_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD2_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + + +#define SQ_IMG_SAMP_WORD3_REG_SZ 32 +#define SQ_IMG_SAMP_WORD3_PERF_MIP_HI_SZ 2 +#define SQ_IMG_SAMP_WORD3_BORDER_COLOR_PTR_SZ 12 +#define SQ_IMG_SAMP_WORD3_BORD_COLOR_TYPE_SZ 2 + +struct sq_img_samp_word3_t { +#if defined(LITTLEENDIAN_CPU) + 
unsigned int PERF_MIP_HI : SQ_IMG_SAMP_WORD3_PERF_MIP_HI_SZ; + unsigned int : 16; + unsigned int BORDER_COLOR_PTR : SQ_IMG_SAMP_WORD3_BORDER_COLOR_PTR_SZ; + unsigned int BORDER_COLOR_TYPE : SQ_IMG_SAMP_WORD3_BORD_COLOR_TYPE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int BORDER_COLOR_TYPE : SQ_IMG_SAMP_WORD3_BORD_COLOR_TYPE_SZ; + unsigned int BORDER_COLOR_PTR : SQ_IMG_SAMP_WORD3_BORDER_COLOR_PTR_SZ; + unsigned int : 16; + unsigned int PERF_MIP_HI : SQ_IMG_SAMP_WORD3_PERF_MIP_HI_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD3 { + sq_img_samp_word3_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD3_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +/**************************************************************/ +/**************************************************************/ +/**************************************************************/ + +typedef enum FMT { +FMT_INVALID = 0x00000000, +FMT_8 = 0x00000001, +FMT_16 = 0x00000002, +FMT_8_8 = 0x00000003, +FMT_32 = 0x00000004, +FMT_16_16 = 0x00000005, +FMT_10_11_11 = 0x00000006, +FMT_11_11_10 = 0x00000007, +FMT_10_10_10_2 = 0x00000008, +FMT_2_10_10_10 = 0x00000009, +FMT_8_8_8_8 = 0x0000000a, +FMT_32_32 = 0x0000000b, +FMT_16_16_16_16 = 0x0000000c, +FMT_32_32_32 = 0x0000000d, +FMT_32_32_32_32 = 0x0000000e, +FMT_RESERVED_78 = 0x0000000f, +FMT_5_6_5 = 0x00000010, +FMT_1_5_5_5 = 0x00000011, +FMT_5_5_5_1 = 0x00000012, +FMT_4_4_4_4 = 0x00000013, +FMT_8_24 = 0x00000014, +FMT_24_8 = 0x00000015, +FMT_X24_8_32 = 0x00000016, +FMT_RESERVED_155 = 0x00000017, +} FMT; + +typedef enum type { +TYPE_UNORM = 0x00000000, +TYPE_SNORM = 0x00000001, +TYPE_USCALED = 0x00000002, +TYPE_SSCALED = 0x00000003, +TYPE_UINT = 0x00000004, +TYPE_SINT = 0x00000005, +TYPE_SRGB = 0x00000006, +TYPE_FLOAT = 0x00000007, +TYPE_RESERVED_8 = 0x00000008, +TYPE_RESERVED_9 = 0x00000009, +TYPE_UNORM_UINT = 0x0000000a, +TYPE_REVERSED_UNORM = 0x0000000b, +TYPE_FLOAT_CLAMP = 0x0000000c, +} type; + +enum FORMAT { +CFMT_INVALID = 0, 
+CFMT_8_UNORM = 1, +CFMT_8_SNORM = 2, +CFMT_8_USCALED = 3, +CFMT_8_SSCALED = 4, +CFMT_8_UINT = 5, +CFMT_8_SINT = 6, +CFMT_16_UNORM = 7, +CFMT_16_SNORM = 8, +CFMT_16_USCALED = 9, +CFMT_16_SSCALED = 10, +CFMT_16_UINT = 11, +CFMT_16_SINT = 12, +CFMT_16_FLOAT = 13, +CFMT_8_8_UNORM = 14, +CFMT_8_8_SNORM = 15, +CFMT_8_8_USCALED = 16, +CFMT_8_8_SSCALED = 17, +CFMT_8_8_UINT = 18, +CFMT_8_8_SINT = 19, +CFMT_32_UINT = 20, +CFMT_32_SINT = 21, +CFMT_32_FLOAT = 22, +CFMT_16_16_UNORM = 23, +CFMT_16_16_SNORM = 24, +CFMT_16_16_USCALED = 25, +CFMT_16_16_SSCALED = 26, +CFMT_16_16_UINT = 27, +CFMT_16_16_SINT = 28, +CFMT_16_16_FLOAT = 29, +CFMT_10_11_11_FLOAT = 30, +CFMT_11_11_10_FLOAT = 31, +CFMT_10_10_10_2_UNORM = 32, +CFMT_10_10_10_2_SNORM = 33, +CFMT_10_10_10_2_UINT = 34, +CFMT_10_10_10_2_SINT = 35, +CFMT_2_10_10_10_UNORM = 36, +CFMT_2_10_10_10_SNORM = 37, +CFMT_2_10_10_10_USCALED = 38, +CFMT_2_10_10_10_SSCALED = 39, +CFMT_2_10_10_10_UINT = 40, +CFMT_2_10_10_10_SINT = 41, +CFMT_8_8_8_8_UNORM = 42, +CFMT_8_8_8_8_SNORM = 43, +CFMT_8_8_8_8_USCALED = 44, +CFMT_8_8_8_8_SSCALED = 45, +CFMT_8_8_8_8_UINT = 46, +CFMT_8_8_8_8_SINT = 47, +CFMT_32_32_UINT = 48, +CFMT_32_32_SINT = 49, +CFMT_32_32_FLOAT = 50, +CFMT_16_16_16_16_UNORM = 51, +CFMT_16_16_16_16_SNORM = 52, +CFMT_16_16_16_16_USCALED = 53, +CFMT_16_16_16_16_SSCALED = 54, +CFMT_16_16_16_16_UINT = 55, +CFMT_16_16_16_16_SINT = 56, +CFMT_16_16_16_16_FLOAT = 57, +CFMT_32_32_32_UINT = 58, +CFMT_32_32_32_SINT = 59, +CFMT_32_32_32_FLOAT = 60, +CFMT_32_32_32_32_UINT = 61, +CFMT_32_32_32_32_SINT = 62, +CFMT_32_32_32_32_FLOAT = 63, +CFMT_8_SRGB = 64, +CFMT_8_8_SRGB = 65, +CFMT_8_8_8_8_SRGB = 66, +CFMT_5_9_9_9_FLOAT = 67, +CFMT_5_6_5_UNORM = 68, +CFMT_1_5_5_5_UNORM = 69, +CFMT_5_5_5_1_UNORM = 70, +CFMT_4_4_4_4_UNORM = 71, +CFMT_4_4_UNORM = 72, +CFMT_1_UNORM = 73, +CFMT_1_REVERSED_UNORM = 74, +CFMT_32_FLOAT_CLAMP = 75, +CFMT_8_24_UNORM = 76, +CFMT_8_24_UINT = 77, +CFMT_24_8_UNORM = 78, +CFMT_24_8_UINT = 79, +CFMT_X24_8_32_UINT = 80, 
+CFMT_X24_8_32_FLOAT = 81, +}; + +typedef enum SEL { + SEL_0 = 0x00000000, + SEL_1 = 0x00000001, + SEL_X = 0x00000004, + SEL_Y = 0x00000005, + SEL_Z = 0x00000006, + SEL_W = 0x00000007, +} SEL; + +typedef enum SQ_RSRC_IMG_TYPE { + SQ_RSRC_IMG_1D = 0x00000008, + SQ_RSRC_IMG_2D = 0x00000009, + SQ_RSRC_IMG_3D = 0x0000000a, + SQ_RSRC_IMG_CUBE_ARRAY = 0x0000000b, + SQ_RSRC_IMG_1D_ARRAY = 0x0000000c, + SQ_RSRC_IMG_2D_ARRAY = 0x0000000d, + SQ_RSRC_IMG_2D_MSAA = 0x0000000e, + SQ_RSRC_IMG_2D_MSAA_ARRAY = 0x0000000f, +} SQ_RSRC_IMG_TYPE; + +typedef enum SQ_TEX_XY_FILTER { + SQ_TEX_XY_FILTER_POINT = 0x00000000, + SQ_TEX_XY_FILTER_BILINEAR = 0x00000001, + SQ_TEX_XY_FILTER_ANISO_POINT = 0x00000002, + SQ_TEX_XY_FILTER_ANISO_BILINEAR = 0x00000003, +} SQ_TEX_XY_FILTER; + +typedef enum SQ_TEX_Z_FILTER { + SQ_TEX_Z_FILTER_NONE = 0x00000000, + SQ_TEX_Z_FILTER_POINT = 0x00000001, + SQ_TEX_Z_FILTER_LINEAR = 0x00000002, +} SQ_TEX_Z_FILTER; + +typedef enum SQ_TEX_MIP_FILTER { + SQ_TEX_MIP_FILTER_NONE = 0x00000000, + SQ_TEX_MIP_FILTER_POINT = 0x00000001, + SQ_TEX_MIP_FILTER_LINEAR = 0x00000002, + SQ_TEX_MIP_FILTER_POINT_ANISO_ADJ__VI = 0x00000003, +} SQ_TEX_MIP_FILTER; + +typedef enum SQ_TEX_CLAMP { + SQ_TEX_WRAP = 0x00000000, + SQ_TEX_MIRROR = 0x00000001, + SQ_TEX_CLAMP_LAST_TEXEL = 0x00000002, + SQ_TEX_MIRROR_ONCE_LAST_TEXEL = 0x00000003, + SQ_TEX_CLAMP_HALF_BORDER = 0x00000004, + SQ_TEX_MIRROR_ONCE_HALF_BORDER = 0x00000005, + SQ_TEX_CLAMP_BORDER = 0x00000006, + SQ_TEX_MIRROR_ONCE_BORDER = 0x00000007, +} SQ_TEX_CLAMP; + +typedef enum SQ_TEX_BORDER_COLOR { + SQ_TEX_BORDER_COLOR_TRANS_BLACK = 0x00000000, + SQ_TEX_BORDER_COLOR_OPAQUE_BLACK = 0x00000001, + SQ_TEX_BORDER_COLOR_OPAQUE_WHITE = 0x00000002, + SQ_TEX_BORDER_COLOR_REGISTER = 0x00000003, +} SQ_TEX_BORDER_COLOR; + +typedef enum TEX_BC_SWIZZLE { +TEX_BC_Swizzle_XYZW = 0x00000000, +TEX_BC_Swizzle_XWYZ = 0x00000001, +TEX_BC_Swizzle_WZYX = 0x00000002, +TEX_BC_Swizzle_WXYZ = 0x00000003, +TEX_BC_Swizzle_ZYXW = 0x00000004, 
+TEX_BC_Swizzle_YXWZ = 0x00000005, +} TEX_BC_SWIZZLE; + +typedef struct metadata_amd_gfx12_s { + uint32_t version; // Must be 1 + uint32_t vendorID; // AMD + SQ_IMG_RSRC_WORD0 word0; + SQ_IMG_RSRC_WORD1 word1; + SQ_IMG_RSRC_WORD2 word2; + SQ_IMG_RSRC_WORD3 word3; + SQ_IMG_RSRC_WORD4 word4; + SQ_IMG_RSRC_WORD5 word5; + SQ_IMG_RSRC_WORD6 word6; + SQ_IMG_RSRC_WORD7 word7; + uint32_t mip_offsets[0]; +} metadata_amd_gfx12_t; + +} // namespace image +} // namespace rocr +#endif // EXT_IMAGE_RESOURCE_GFX12_H_ + diff --git a/src/image/util.h b/src/image/util.h index 8482e41a4..88cdf4ccc 100644 --- a/src/image/util.h +++ b/src/image/util.h @@ -99,7 +99,7 @@ static __forceinline void* _aligned_malloc(size_t size, size_t alignment) { return aligned_alloc(alignment, size); #else void* mem = NULL; - if (NULL != posix_memalign(&mem, alignment, size)) return NULL; + if (0 != posix_memalign(&mem, alignment, size)) return NULL; return mem; #endif } diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index 51aa389a0..74f15d7d7 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -75,7 +75,8 @@ enum { ELFABIVERSION_AMDGPU_HSA_V2 = 0, ELFABIVERSION_AMDGPU_HSA_V3 = 1, ELFABIVERSION_AMDGPU_HSA_V4 = 2, - ELFABIVERSION_AMDGPU_HSA_V5 = 3 + ELFABIVERSION_AMDGPU_HSA_V5 = 3, + ELFABIVERSION_AMDGPU_HSA_V6 = 4, }; // AMDGPU specific e_flags. @@ -87,6 +88,7 @@ enum : unsigned { EF_AMDGPU_MACH_NONE = 0x000, // AMDGCN-based processors. 
+ // clang-format off EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, @@ -127,13 +129,25 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, + EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050, + EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, + EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, + EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053, + EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055, + // clang-format on // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX942, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. @@ -159,8 +173,7 @@ enum : unsigned { // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values. // - // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, - // ELFABIVERSION_AMDGPU_HSA_V5. + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, // XNACK is not supported. EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000, @@ -173,8 +186,7 @@ enum : unsigned { // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values. // - // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, - // ELFABIVERSION_AMDGPU_HSA_V5. + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. 
EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, // SRAMECC is not supported. EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000, @@ -184,6 +196,21 @@ enum : unsigned { EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800, // SRAMECC is on. EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00, + + // Generic target versioning. This is contained in the last byte of EFLAGS. + EF_AMDGPU_GENERIC_VERSION = 0xff000000, + EF_AMDGPU_GENERIC_VERSION_OFFSET = 24, + EF_AMDGPU_GENERIC_VERSION_MIN = 1, + EF_AMDGPU_GENERIC_VERSION_MAX = 0xff, +}; + +// ELF Relocation types for AMDGPU. +enum : unsigned { + R_AMDGPU_ABS32_LO = 1, + R_AMDGPU_ABS32_HI = 2, + R_AMDGPU_ABS64 = 3, + R_AMDGPU_ABS32 = 6, + R_AMDGPU_RELATIVE64 = 13, }; } // end namespace ELF @@ -245,14 +272,14 @@ typedef enum { // ELF Symbol Flag Enumeration Values. #define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST -// AMD GPU Relocation Type Enumeration Values. -#define R_AMDGPU_NONE 0 -#define R_AMDGPU_32_LOW 1 -#define R_AMDGPU_32_HIGH 2 -#define R_AMDGPU_64 3 -#define R_AMDGPU_INIT_SAMPLER 4 -#define R_AMDGPU_INIT_IMAGE 5 -#define R_AMDGPU_RELATIVE64 13 +// Legacy/V1 AMD GPU Relocation Type Enumeration Values. +#define R_AMDGPU_V1_NONE 0 +#define R_AMDGPU_V1_32_LOW 1 +#define R_AMDGPU_V1_32_HIGH 2 +#define R_AMDGPU_V1_64 3 +#define R_AMDGPU_V1_INIT_SAMPLER 4 +#define R_AMDGPU_V1_INIT_IMAGE 5 +#define R_AMDGPU_V1_RELATIVE64 13 // AMD GPU Note Type Enumeration Values. #define NT_AMD_HSA_CODE_OBJECT_VERSION 1 diff --git a/src/inc/hsa.h b/src/inc/hsa.h index 9520bd870..1ad714c44 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -598,10 +598,14 @@ typedef enum { * AqlProfile extension. */ HSA_EXTENSION_AMD_AQLPROFILE = 0x202, + /** + * PC Sampling extension. + */ + HSA_EXTENSION_AMD_PC_SAMPLING = 0x203, /** * Last AMD extension. */ - HSA_AMD_LAST_EXTENSION = 0x202 + HSA_AMD_LAST_EXTENSION = 0x203 } hsa_extension_t; /** @@ -5656,7 +5660,12 @@ typedef enum { * undefined if the symbol is not an indirect function. The type of this * attribute is uint32_t.
*/ - HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16, + /** + * Wavefront size used by the kernel. The value of this attribute is either + * 32 or 64. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19 } hsa_code_symbol_info_t; /** diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h index e46c777af..2a0f59df3 100644 --- a/src/inc/hsa_api_trace.h +++ b/src/inc/hsa_api_trace.h @@ -44,39 +44,26 @@ #define HSA_RUNTIME_INC_HSA_API_TRACE_H #include "hsa.h" +#include "hsa_api_trace_version.h" #ifdef AMD_INTERNAL_BUILD #include "hsa_ext_image.h" #include "hsa_ext_amd.h" #include "hsa_ext_finalize.h" #include "hsa_amd_tool.h" +#include "hsa_ven_amd_pc_sampling.h" #else #include "inc/hsa_ext_image.h" #include "inc/hsa_ext_amd.h" #include "inc/hsa_ext_finalize.h" #include "inc/hsa_amd_tool.h" +#include "inc/hsa_ven_amd_pc_sampling.h" #endif #include #include #include -// Major Ids of the Api tables exported by Hsa Core Runtime -#define HSA_API_TABLE_MAJOR_VERSION 0x03 -#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 -#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 - -// Step Ids of the Api tables exported by Hsa Core Runtime -#define HSA_API_TABLE_STEP_VERSION 0x00 -#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x01 -#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 -#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 -#define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h // Min function used to copy Api Tables static inline uint32_t Min(const uint32_t a, const uint32_t b) { @@ -191,6 
+178,19 @@ struct ImageExtTable { decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn; }; +// Table to export HSA PC Sampling Extension Apis +struct PcSamplingExtTable { + ApiTableVersion version; + decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn; + decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn; + decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn; + decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn; + decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn; + decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn; + decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn; +}; + + // Table to export AMD Extension Apis struct AmdExtTable { ApiTableVersion version; @@ -263,6 +263,8 @@ struct AmdExtTable { decltype(hsa_amd_vmem_get_alloc_properties_from_handle)* hsa_amd_vmem_get_alloc_properties_from_handle_fn; decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn; + decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn; + decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn; }; // Table to export HSA Core Runtime Apis @@ -464,6 +466,9 @@ struct HsaApiTable { // Table of function pointers for tools to use ToolsApiTable* tools_; + + // Table of function pointers to AMD PC Sampling Extension + PcSamplingExtTable* pc_sampling_ext_; }; // Structure containing instances of different api tables @@ -474,6 +479,7 @@ struct HsaApiTableContainer { FinalizerExtTable finalizer_ext; ImageExtTable image_ext; ToolsApiTable tools; + PcSamplingExtTable pc_sampling_ext; // Default initialization of a container instance HsaApiTableContainer() { @@ -505,6 +511,11 @@ struct HsaApiTableContainer { tools.version.minor_id = sizeof(ToolsApiTable); tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION; root.tools_ = &tools; + + pc_sampling_ext.version.major_id = 
HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable); + pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + root.pc_sampling_ext_ = &pc_sampling_ext; } }; @@ -562,5 +573,7 @@ static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) { copyElement(&dest->image_ext_->version, &src->image_ext_->version); if ((offsetof(HsaApiTable, tools_) < dest->version.minor_id)) copyElement(&dest->tools_->version, &src->tools_->version); + if ((offsetof(HsaApiTable, pc_sampling_ext_) < dest->version.minor_id)) + copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version); } #endif diff --git a/src/inc/hsa_api_trace_version.h b/src/inc/hsa_api_trace_version.h new file mode 100644 index 000000000..3393a7762 --- /dev/null +++ b/src/inc/hsa_api_trace_version.h @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H + +// CODE IN THIS FILE **MUST** BE C-COMPATIBLE + +// Major Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_MAJOR_VERSION 0x03 +#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION 0x01 + +// Step Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_STEP_VERSION 0x01 +#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x03 +#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 +#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 +#define 
HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION 0x00 + +#endif // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 187bcd958..f9f60edeb 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -47,16 +47,19 @@ #include "hsa.h" #include "hsa_ext_image.h" +#include "hsa_ven_amd_pc_sampling.h" -/* +/** * - 1.0 - initial version * - 1.1 - dmabuf export * - 1.2 - hsa_amd_memory_async_copy_on_engine * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool * - 1.4 - Virtual Memory API + * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES + * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 4 +#define HSA_AMD_INTERFACE_VERSION_MINOR 6 #ifdef __cplusplus extern "C" { @@ -221,6 +224,11 @@ enum { * Exceeded number of VGPRs available on this agent */ HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45, + + /** + * Resource is busy or temporarily unavailable + */ + HSA_STATUS_ERROR_RESOURCE_BUSY = 46, }; /** @@ -1176,7 +1184,11 @@ typedef enum hsa_amd_memory_pool_flag_s { * connection. Atomic memory operations on these memory buffers are not * guaranteed to be visible at system scope. */ - HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1, + HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0), + /** + * Allocates physically contiguous memory + */ + HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1), } hsa_amd_memory_pool_flag_t; @@ -2783,7 +2795,7 @@ hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* d */ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); -/* +/** * @brief Allocate a reserved address range * * Reserve a virtual address range. The size must be a multiple of the system page size. 
@@ -2803,11 +2815,39 @@ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address * range of this size. + * + * Note that this API will be deprecated in a future release and replaced by + * hsa_amd_vmem_address_reserve_align */ hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address, uint64_t flags); -/* +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2 + * @param[in] flags currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + */ +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + +/** * @brief Free a reserved address range * * Free a previously allocated address range. 
The size must match the size of a previously @@ -2841,7 +2881,7 @@ typedef enum { MEMORY_TYPE_PINNED, } hsa_amd_memory_type_t; -/* +/** * @brief Create a virtual memory handle * * Create a virtual memory handle within this pool @@ -2870,7 +2910,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, hsa_amd_memory_type_t type, uint64_t flags, hsa_amd_vmem_alloc_handle_t* memory_handle); -/* +/** * @brief Release a virtual memory handle * * @param[in] memory handle that was previously allocated @@ -2881,7 +2921,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, */ hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle); -/* +/** * @brief Map a virtual memory handle * * Map a virtual memory handle to a reserved address range. The virtual address requested must be @@ -2907,7 +2947,7 @@ hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_hand hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset, hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags); -/* +/** * @brief Unmap a virtual memory handle * * Unmap previously mapped virtual address range @@ -2930,7 +2970,7 @@ typedef struct hsa_amd_memory_access_desc_s { hsa_agent_t agent_handle; } hsa_amd_memory_access_desc_t; -/* +/** * @brief Make a memory mapping accessible * * Make previously mapped virtual address accessible to specific agents. @p size must be equal to @@ -2959,7 +2999,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, const hsa_amd_memory_access_desc_t* desc, size_t desc_cnt); -/* +/** * @brief Get current access permissions for memory mapping * * Get access permissions for memory mapping for specific agent. 
@@ -2980,7 +3020,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, hsa_agent_t agent_handle); -/* +/** * @brief Get an exportable shareable handle * * Get an exportable shareable handle for a memory_handle. This shareabl handle can then be used to @@ -3003,7 +3043,7 @@ hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, hsa_amd_vmem_alloc_handle_t handle, uint64_t flags); -/* +/** * @brief Import a shareable handle * * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed @@ -3023,7 +3063,7 @@ hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, hsa_amd_vmem_alloc_handle_t* handle); -/* +/** * @brief Returns memory handle for mapped memory * * Return a memory handle for previously mapped memory. 
The handle will be the same value of handle @@ -3040,19 +3080,19 @@ hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle, void* addr); -/* -* @brief Returns the current allocation properties of a handle -* -* Returns the allocation properties of an existing handle -* -* @param[in] memory_handle memory handle to be queried -* @param[out] pool memory pool that owns this handle -* @param[out] memory type - -* @retval ::HSA_STATUS_SUCCESS -* -* @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle -*/ +/** + * @brief Returns the current allocation properties of a handle + * + * Returns the allocation properties of an existing handle + * + * @param[in] memory_handle memory handle to be queried + * @param[out] pool memory pool that owns this handle + * @param[out] memory type + + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle + */ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool, hsa_amd_memory_type_t* type); @@ -3084,6 +3124,22 @@ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( */ hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); +typedef enum { + /* + * Returns the agent that owns the underlying HW queue. + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_QUEUE_INFO_AGENT, + /* + * Returns the doorbell ID of the completion signal of the queue + * The type of this attribute is uint64_t. 
+ */ + HSA_AMD_QUEUE_INFO_DOORBELL_ID, +} hsa_queue_info_attribute_t; + +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/src/inc/hsa_ven_amd_aqlprofile.h b/src/inc/hsa_ven_amd_aqlprofile.h index 32ca6b732..0022c0d8b 100644 --- a/src/inc/hsa_ven_amd_aqlprofile.h +++ b/src/inc/hsa_ven_amd_aqlprofile.h @@ -149,61 +149,61 @@ hsa_status_t hsa_ven_amd_aqlprofile_validate_event( // All parameters are generic and if not applicable for a specific // profile configuration then error status will be returned. typedef enum { - /* - * Select the target compute unit (wgp) for profiling. - */ + /** + * Select the target compute unit (wgp) for profiling. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0, - /* - * VMID Mask - */ + /** + * VMID Mask + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4, - /* - * Shader engine mask for selection. - */ + /** + * Shader engine mask for selection. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7, - /* - * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) - */ + /** + * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8, - /* - * Set true for occupancy collection only. 
- */ + /** + * Set true for occupancy collection only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9, - /* - * ATT collection max data size, in MB. Shared among shader engines. - */ + /** + * ATT collection max data size, in MB. Shared among shader engines. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10, - /* - * Mask of which compute units to generate perfcounters. GFX9 only. - */ + /** + * Mask of which compute units to generate perfcounters. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240, - /* - * Select collection period for perfcounters. GFX9 only. - */ + /** + * Select collection period for perfcounters. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241, - /* - * Select perfcounter ID (SQ block) for collection. GFX9 only. - */ + /** + * Select perfcounter ID (SQ block) for collection. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242, } hsa_ven_amd_aqlprofile_parameter_name_t; @@ -365,11 +365,11 @@ hsa_status_t hsa_ven_amd_aqlprofile_error_string( /** * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. -*/ + */ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name); /** * @brief Iterate over all possible event coordinate IDs and their names. -*/ + */ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t); /** @@ -380,7 +380,7 @@ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eve * @param coordinate The coordinate, in the range [0,extent-1]. * @param name Coordinate name as in _iterate_event_ids. * @param userdata Userdata returned from _iterate_event_coord function. 
-*/ + */ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( int position, int id, @@ -397,7 +397,7 @@ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data. * @param[in] callback Callback function to return the coordinates. * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. -*/ + */ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord( hsa_agent_t agent, hsa_ven_amd_aqlprofile_event_t event, diff --git a/src/inc/hsa_ven_amd_pc_sampling.h b/src/inc/hsa_ven_amd_pc_sampling.h new file mode 100644 index 000000000..019f0ea5c --- /dev/null +++ b/src/inc/hsa_ven_amd_pc_sampling.h @@ -0,0 +1,416 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_H +#define HSA_VEN_AMD_PC_SAMPLING_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + + +/** + * @brief HSA AMD Vendor PC Sampling APIs + * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be + * modified extensively in the future + */ + +/** + * @brief PC Sampling sample data for hosttrap sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t reserved0; + uint64_t reserved1; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_hosttrap_v1_t; + +/** + * @brief PC Sampling sample data for stochastic sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t perf_snapshot_data; + uint32_t perf_snapshot_data1; + 
uint32_t perf_snapshot_data2; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_snapshot_v1_t; + +/** + * @brief PC Sampling method kinds + */ +typedef enum { + HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1, + HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1 +} hsa_ven_amd_pcs_method_kind_t; + +/** + * @brief PC Sampling interval unit type + */ +typedef enum { + HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS +} hsa_ven_amd_pcs_units_t; + +/** + * @brief HSA callback function to perform the copy onto a destination buffer + * + * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal + * buffers. Remaining contents of HSA internal buffers will be included in next + * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling + * hsa_ven_amd_pcs_flush. + * + * @param[in] hsa_callback_data private data to pass back to HSA. Provided in + * hsa_ven_amd_pcs_data_ready_callback_t + * + * @param[in] data_size size of destination buffer in bytes. + * @param[in] destination destination buffer + * @retval TBD: but could be used to indicate that there is no more data to be read. + * Or indicate an error and abort of current copy operations + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data, + size_t data_size, void* destination); + +/** + * @brief HSA callback function to to indicate that there is data ready to be copied + * + * When the client receives this callback, the client should call back @p data_copy_callback for HSA + * to perform the copy operation into an available buffer. @p data_copy_callback can be called back + * multiple times with smaller @p data_size to split the copy operation. + * + * This callback must not call ::hsa_ven_amd_pcs_flush. 
+ * + * @param[in] client_callback_data client private data passed in via + * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id + * @param[in] data_size size of data available to be copied + * @param[in] lost_sample_count number of lost samples since last call to + * hsa_ven_amd_pcs_data_ready_callback_t. + * @param[in] data_copy_callback callback function for HSA to perform the actual copy + * @param[in] hsa_callback_data private data to pass back to HSA + */ +typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)( + void* client_callback_data, size_t data_size, size_t lost_sample_count, + hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data); + +/** + * @brief Opaque handle representing a sampling session. + * Two sessions having same handle value represent the same session + */ +typedef struct { + uint64_t handle; +} hsa_ven_amd_pcs_t; + +/** + * @brief PC Sampling configuration flag options + */ +typedef enum { + /* The interval for this sampling method have to be a power of 2 */ + HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0) +} hsa_ven_amd_pcs_configuration_flags_t; + +/** + * @brief PC Sampling method information + * Used to provide client with list of supported PC Sampling methods + */ +typedef struct { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t min_interval; + size_t max_interval; + uint64_t flags; +} hsa_ven_amd_pcs_configuration_t; + +/** + * @brief Callback function to iterate through list of supported PC Sampling configurations + * + * @param[in] configuration one entry for supported PC Sampling method and configuration options + * @param[in] callback_data client private callback data that was passed in when calling + * hsa_ven_amd_pcs_iterate_configuration + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)( + const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data); + +/** + * @brief Iterate through list of current 
supported PC Sampling configurations for this @p agent + * + * HSA will callback @p configuration_callback for each currently available PC Sampling + * configuration. The list of currently available configurations may not be the complete list of + * configurations supported on the @p agent. The list of currently available configurations may be + * reduced if the @p agent is currently handling other PC sampling sessions. + * + * @param[in] agent target agent + * @param[in] configuration_callback callback function to iterate through list of configurations + * @param[in] callback_data client private callback data + **/ +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +/** + * @brief Create a PC Sampling session on @p agent + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, as described by the + * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of + * hsa_ven_amd_pcs_iterate_configuration for this @p agent. + * A successfull call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. 
This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + + +/** + * @brief Creates a PC Sampling session on @p agent. 
Assumes that the caller provides the + * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling + * on the @p agent. + * + * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing + * PC sampling session that was previously created in the underlying driver. + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, and match the parameters that we used to create + * the underlying PC Sampling session in the underlying driver. + * A successfull call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. 
This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +/** + * @brief Free a PC Sampling session on @p agent + * + * Free all the resources allocated for a PC Sampling session on @p agent + * Internal buffers for this session will be lost. + * If the session was active, the session will be stopped before it is destroyed. 
+ * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Start a PC Sampling session + * + * Activate a PC Sampling session that was previous created. + * The session with be in a active state after this call + * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session started successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Stop a PC Sampling session + * + * Stop a session that is currently active + * After a session is stopped HSA may still have some PC Sampling data in its internal buffers. + * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal + * buffers are not drained and the session is started again, the internal buffers will be available + * on the next data_ready_callback. + * If the session was already inactive, this will result in a no-op and will return + * HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session stopped successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Flush internal buffers for a PC Sampling session + * + * Drain internal buffers for a PC Sampling session. If internal buffers have available data, + * this trigger a data_ready_callback. 
+ * + * The function blocks until all PC samples associated with the @p pc_sampling session + * generated prior to the function call have been communicated by invocations of + * @p data_ready_callback having completed execution. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session flushed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +#define hsa_ven_amd_pc_sampling_1_00 + +/** + * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by + * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t { + hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_create_from_id)( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling); + +} hsa_ven_amd_pc_sampling_1_00_pfn_t; + +#ifdef __cplusplus +} // end extern "C" block 
+#endif /*__cplusplus*/ + +#endif /* HSA_VEN_AMD_PC_SAMPLING_H */ diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index 08836a577..ff70e61bd 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -483,6 +483,10 @@ namespace code { *major = 5; *minor = 0; return true; + case ELF::ELFABIVERSION_AMDGPU_HSA_V6: + *major = 6; + *minor = 0; + return true; } return false; @@ -600,6 +604,12 @@ namespace code { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: MI.Name = "gfx1103"; MI.XnackSupported = false; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: MI.Name = "gfx1150"; MI.XnackSupported = false; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: MI.Name = "gfx1151"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: MI.Name = "gfx9-generic"; MI.XnackSupported = true; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: MI.Name = "gfx10-1-generic"; MI.XnackSupported = true; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC: MI.Name = "gfx10-3-generic"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC: MI.Name = "gfx11-generic"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: MI.Name = "gfx1200"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: MI.Name = "gfx1201"; MI.XnackSupported = false; MI.SrameccSupported = false; break; default: return false; } return true; @@ -687,13 +697,17 @@ namespace code { return MI.Name; } - bool AmdHsaCode::GetIsa(std::string& isa_name) + bool AmdHsaCode::GetIsa(std::string& isa_name, unsigned *genericVersion) { isa_name.clear(); uint32_t code_object_major_version = 0; uint32_t code_object_minor_version = 0; + // 
Generic versioning starts at 1, so zero means no generic version. + if (genericVersion) + *genericVersion = 0; + switch (img->EClass()) { case ELFCLASS64: // There is no e_machine and/or OS ABI for R600 so rely on checking @@ -740,7 +754,7 @@ namespace code { MI.Name += ":xnack+"; else if (MI.XnackSupported) MI.Name += ":xnack-"; - } else if (code_object_major_version == 4 || code_object_major_version == 5) { + } else if (code_object_major_version >= 4) { switch (img->EFlags() & ELF::EF_AMDGPU_FEATURE_SRAMECC_V4) { case ELF::EF_AMDGPU_FEATURE_SRAMECC_OFF_V4: MI.Name += ":sramecc-"; @@ -758,6 +772,12 @@ namespace code { MI.Name += ":xnack+"; break; } + + // Generic version is not part of the ISA name. + // Only parse it when the caller wants it. + if (genericVersion && code_object_major_version >= 6) { + *genericVersion = (img->EFlags() & ELF::EF_AMDGPU_GENERIC_VERSION) >> ELF::EF_AMDGPU_GENERIC_VERSION_OFFSET; + } } else { return false; } @@ -936,7 +956,7 @@ namespace code { uint64_t offset = ImageInitSection()->addData(&desc, sizeof(desc), 8); amd::elf::Symbol* imageInit = img->symtab()->addSymbol(ImageInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); - image->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_IMAGE, imageInit, image->elfSym()->value() + destOffset, 0); + image->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_V1_INIT_IMAGE, imageInit, image->elfSym()->value() + destOffset, 0); } void AmdHsaCode::AddImageInitializer( @@ -977,7 +997,7 @@ namespace code { uint64_t offset = SamplerInitSection()->addData(&desc, sizeof(desc), 8); amd::elf::Symbol* samplerInit = img->symtab()->addSymbol(SamplerInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); - sampler->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_SAMPLER, samplerInit, sampler->elfSym()->value() + destOffset, 0); + sampler->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_V1_INIT_SAMPLER, 
samplerInit, sampler->elfSym()->value() + destOffset, 0); } void AmdHsaCode::AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, @@ -996,7 +1016,7 @@ namespace code { void AmdHsaCode::AddInitVarWithAddress(bool large, Symbol* dest, uint64_t destOffset, Symbol* addrOf, uint64_t addrAddend) { - uint32_t rtype = large ? R_AMDGPU_64 : R_AMDGPU_32_LOW; + uint32_t rtype = large ? R_AMDGPU_V1_64 : R_AMDGPU_V1_32_LOW; dest->elfSym()->section()->relocationSection()->addRelocation(rtype, addrOf->elfSym(), dest->elfSym()->value() + destOffset, addrAddend); } diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index 2a3b96f24..a6ea83c33 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -81,8 +81,10 @@ __attribute__((noinline)) static void _loader_debug_state() { // 6: New trap handler ABI. ttmp6[25:0] contains dispatch index modulo queue size // 7: New trap handler ABI. Send interrupts as a bitmask, coalescing concurrent exceptions. // 8: New trap handler ABI. for gfx940: Initialize ttmp[4:5] if ttmp11[31] == 0. -// 9: New trap handler API. For gfx11: Save PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped. -HSA_API r_debug _amdgpu_r_debug = {9, +// 9: New trap handler ABI. For gfx11: Save PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped. +// 10: New trap handler ABI. Set status.skip_export when halting the wave. +// For gfx940, set ttmp6[31] = 0 if ttmp11[31] == 0. 
+HSA_API r_debug _amdgpu_r_debug = {10, nullptr, reinterpret_cast(&_loader_debug_state), r_debug::RT_CONSISTENT, @@ -186,6 +188,18 @@ Executable* AmdHsaCodeLoader::CreateExecutable( return executables.back(); } +Executable* AmdHsaCodeLoader::CreateExecutable( + std::unique_ptr isolated_context, + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode) +{ + WriterLockGuard writer_lock(rw_lock_); + + executables.push_back(new ExecutableImpl(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode)); + return executables.back(); +} + static void AddCodeObjectInfoIntoDebugMap(link_map* map) { if (r_debug_tail) { r_debug_tail->l_next = map; @@ -522,6 +536,10 @@ bool KernelSymbol::GetInfo(hsa_symbol_info32_t symbol_info, void *value) { *((bool*)value) = is_dynamic_callstack; break; } + case HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE: { + *((uint32_t*)value) = wavefront_size; + break; + } case HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE: { *((uint32_t*)value) = size; break; @@ -734,6 +752,22 @@ ExecutableImpl::ExecutableImpl( { } +ExecutableImpl::ExecutableImpl( + const hsa_profile_t &_profile, + std::unique_ptr unique_context, + size_t id, + hsa_default_float_rounding_mode_t default_float_rounding_mode) + : Executable() + , profile_(_profile) + , unique_context_(std::move(unique_context)) + , id_(id) + , default_float_rounding_mode_(default_float_rounding_mode) + , state_(HSA_EXECUTABLE_STATE_UNFROZEN) + , program_allocation_segment(nullptr) +{ + context_ = unique_context_.get(); +} + ExecutableImpl::~ExecutableImpl() { for (ExecutableObject* o : objects) { o->Destroy(); @@ -1212,7 +1246,8 @@ hsa_status_t ExecutableImpl::LoadCodeObject( } std::string codeIsa; - if (!code->GetIsa(codeIsa)) { + unsigned genericVersion; + if (!code->GetIsa(codeIsa, &genericVersion)) { logger_ << "LoaderError: failed to determine code object's ISA\n"; return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ 
-1223,7 +1258,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - if (majorVersion < 1 || majorVersion > 5) { + if (majorVersion < 1 || majorVersion > 6) { logger_ << "LoaderError: unsupported code object version: " << majorVersion << "\n"; return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -1251,7 +1286,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( return HSA_STATUS_ERROR_INVALID_ISA_NAME; } - if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa)) { + if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa, genericVersion)) { logger_ << "LoaderError: code object's ISA (" << codeIsa.c_str() << ") is not supported by the agent\n"; return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; } @@ -1434,6 +1469,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, uint32_t group_segment_size = kd.group_segment_fixed_size; uint32_t private_segment_size = kd.private_segment_fixed_size; bool is_dynamic_callstack = AMDHSA_BITS_GET(kd.kernel_code_properties, rocr::llvm::amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); + bool uses_wave32 = AMDHSA_BITS_GET( kd.kernel_code_properties, rocr::llvm::amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); uint64_t size = sym->Size(); @@ -1449,6 +1485,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, is_dynamic_callstack, size, 64, + uses_wave32 ? 32 : 64, address); symbol = kernel_symbol; } else if (sym->IsVariableSymbol()) { @@ -1478,6 +1515,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, uint32_t(akc.workitem_private_segment_byte_size); bool is_dynamic_callstack = AMD_HSA_BITS_GET(akc.kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK) ? 
true : false; + bool uses_wave32 = akc.wavefront_size == AMD_POWERTWO_32; uint64_t size = sym->Size(); @@ -1498,6 +1536,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, is_dynamic_callstack, size, 256, + uses_wave32 ? 32 : 64, address); kernel_symbol->debug_info.elf_raw = code->ElfData(); kernel_symbol->debug_info.elf_size = code->ElfSize(); @@ -1585,15 +1624,24 @@ Segment* ExecutableImpl::SectionSegment(hsa_agent_t agent, code::Section* sec) hsa_status_t ExecutableImpl::ApplyRelocations(hsa_agent_t agent, amd::hsa::code::AmdHsaCode *c) { hsa_status_t status = HSA_STATUS_SUCCESS; + + uint32_t majorVersion, minorVersion; + if (!c->GetCodeObjectVersion(&majorVersion, &minorVersion)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + for (size_t i = 0; i < c->RelocationSectionCount(); ++i) { if (c->GetRelocationSection(i)->targetSection()) { + // Static relocations may be present if --emit-relocs + // option was passed to lld, but they cannot be applied + // again, so skip it for code object v2 and up. + if (majorVersion >= 2) { + continue; + } + status = ApplyStaticRelocationSection(agent, c->GetRelocationSection(i)); } else { // Dynamic relocations are supported starting code object v2.1. 
- uint32_t majorVersion, minorVersion; - if (!c->GetCodeObjectVersion(&majorVersion, &minorVersion)) { - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } if (majorVersion < 2) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -1628,9 +1676,9 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: Segment* rseg = SectionSegment(agent, sec); size_t reladdr = sec->addr() + rel->offset(); switch (rel->type()) { - case R_AMDGPU_32_LOW: - case R_AMDGPU_32_HIGH: - case R_AMDGPU_64: + case R_AMDGPU_V1_32_LOW: + case R_AMDGPU_V1_32_HIGH: + case R_AMDGPU_V1_64: { uint64_t addr; switch (sym->type()) { @@ -1661,15 +1709,15 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: uint32_t addr32 = 0; switch (rel->type()) { - case R_AMDGPU_32_HIGH: + case R_AMDGPU_V1_32_HIGH: addr32 = uint32_t((addr >> 32) & 0xFFFFFFFF); rseg->Copy(reladdr, &addr32, sizeof(addr32)); break; - case R_AMDGPU_32_LOW: + case R_AMDGPU_V1_32_LOW: addr32 = uint32_t(addr & 0xFFFFFFFF); rseg->Copy(reladdr, &addr32, sizeof(addr32)); break; - case R_AMDGPU_64: + case R_AMDGPU_V1_64: rseg->Copy(reladdr, &addr, sizeof(addr)); break; default: @@ -1678,7 +1726,7 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: break; } - case R_AMDGPU_INIT_SAMPLER: + case R_AMDGPU_V1_INIT_SAMPLER: { if (STT_AMDGPU_HSA_METADATA != sym->type() || SHT_PROGBITS != sym->section()->type() || @@ -1709,7 +1757,7 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: break; } - case R_AMDGPU_INIT_IMAGE: + case R_AMDGPU_V1_INIT_IMAGE: { if (STT_AMDGPU_HSA_METADATA != sym->type() || SHT_PROGBITS != sym->section()->type() || @@ -1822,7 +1870,7 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: symAddr += rel->addend(); switch (rel->type()) { - case R_AMDGPU_32_HIGH: + case ELF::R_AMDGPU_ABS32_HI: { if (!symAddr) { logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is 
undefined\n"; @@ -1834,7 +1882,7 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: break; } - case R_AMDGPU_32_LOW: + case ELF::R_AMDGPU_ABS32_LO: { if (!symAddr) { logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; @@ -1846,7 +1894,19 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: break; } - case R_AMDGPU_64: + case ELF::R_AMDGPU_ABS32: + { + if (!symAddr) { + logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; + return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + + uint32_t symAddr32 = uint32_t(symAddr); + relSeg->Copy(rel->offset(), &symAddr32, sizeof(symAddr32)); + break; + } + + case ELF::R_AMDGPU_ABS64: { if (!symAddr) { logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; @@ -1857,7 +1917,7 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: break; } - case R_AMDGPU_RELATIVE64: + case ELF::R_AMDGPU_RELATIVE64: { int64_t baseDelta = reinterpret_cast(relSeg->Address(0)) - relSeg->VAddr(); uint64_t relocatedAddr = baseDelta + rel->addend(); diff --git a/src/loader/executable.hpp b/src/loader/executable.hpp index c360df348..9429ff948 100644 --- a/src/loader/executable.hpp +++ b/src/loader/executable.hpp @@ -144,6 +144,7 @@ class KernelSymbol final: public SymbolImpl { const bool &_is_dynamic_callstack, const uint32_t &_size, const uint32_t &_alignment, + const uint32_t &_wavefront_size, const uint64_t &_address = 0) : SymbolImpl(_is_loaded, HSA_SYMBOL_KIND_KERNEL, @@ -159,7 +160,8 @@ class KernelSymbol final: public SymbolImpl { , private_segment_size(_private_segment_size) , is_dynamic_callstack(_is_dynamic_callstack) , size(_size) - , alignment(_alignment) {} + , alignment(_alignment) + , wavefront_size(_wavefront_size) {} ~KernelSymbol() {} @@ -173,6 +175,7 @@ class KernelSymbol final: public SymbolImpl { bool is_dynamic_callstack; uint32_t size; uint32_t 
alignment; + uint32_t wavefront_size; amd_runtime_loader_debug_info_t debug_info; private: @@ -420,6 +423,12 @@ friend class AmdHsaCodeLoader; size_t id, hsa_default_float_rounding_mode_t default_float_rounding_mode); + ExecutableImpl( + const hsa_profile_t &_profile, + std::unique_ptr unique_context, + size_t id, + hsa_default_float_rounding_mode_t default_float_rounding_mode); + ~ExecutableImpl(); hsa_status_t GetInfo(hsa_executable_info_t executable_info, void *value) override; @@ -546,6 +555,7 @@ friend class AmdHsaCodeLoader; amd::hsa::common::ReaderWriterLock rw_lock_; hsa_profile_t profile_; Context *context_; + std::unique_ptr unique_context_; Logger logger_; const size_t id_; hsa_default_float_rounding_mode_t default_float_rounding_mode_; @@ -575,6 +585,12 @@ class AmdHsaCodeLoader : public Loader { const char *options, hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) override; + Executable* CreateExecutable( + std::unique_ptr isolated_context, + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) override; + hsa_status_t FreezeExecutable(Executable *executable, const char *options) override; void DestroyExecutable(Executable *executable) override; diff --git a/src/pcs/hsa_ven_amd_pc_sampling.cpp b/src/pcs/hsa_ven_amd_pc_sampling.cpp new file mode 100644 index 000000000..f57d7353c --- /dev/null +++ b/src/pcs/hsa_ven_amd_pc_sampling.cpp @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "pcs_runtime.h" +#include "core/inc/agent.h" +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/exceptions.h" + +namespace rocr { +namespace AMD { +hsa_status_t handleException(); + +template static __forceinline T handleExceptionT() { + handleException(); + abort(); + return T(); +} +} // namespace AMD + +#define IS_OPEN() \ + do { \ + if (!core::Runtime::runtime_singleton_->IsOpen()) return HSA_STATUS_ERROR_NOT_INITIALIZED; \ + } while (false) + +template static __forceinline bool IsValid(T* ptr) { + return (ptr == NULL) ? NULL : ptr->IsValid(); +} + +#define TRY try { +#define CATCH \ + } \ + catch (...) { \ + return AMD::handleException(); \ + } +#define CATCHRET(RETURN_TYPE) \ + } \ + catch (...) { \ + return AMD::handleExceptionT(); \ + } + +namespace pcs { + +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t hsa_agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data) { + TRY; + IS_OPEN(); + + core::Agent* agent = core::Agent::Convert(hsa_agent); + if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice) + return HSA_STATUS_ERROR_INVALID_AGENT; + + return PcsRuntime::instance()->PcSamplingIterateConfig(agent, configuration_callback, + callback_data); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t hsa_agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + TRY; + IS_OPEN(); + core::Agent* agent = core::Agent::Convert(hsa_agent); + if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice) + return HSA_STATUS_ERROR_INVALID_AGENT; + + return PcsRuntime::instance()->PcSamplingCreate( + agent, method, units, interval, latency, 
buffer_size, data_ready_cb, client_cb_data, handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_create_from_id(uint32_t pcs_id, hsa_agent_t hsa_agent, + hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + TRY; + IS_OPEN(); + core::Agent* agent = core::Agent::Convert(hsa_agent); + if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice) + return HSA_STATUS_ERROR_INVALID_AGENT; + + return PcsRuntime::instance()->PcSamplingCreateFromId(pcs_id, agent, method, units, interval, + latency, buffer_size, data_ready_cb, + client_cb_data, handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingDestroy(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingStart(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingStop(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingFlush(handle); + CATCH; +} + +void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api) { + pcs_api->hsa_ven_amd_pcs_iterate_configuration_fn = hsa_ven_amd_pcs_iterate_configuration; + pcs_api->hsa_ven_amd_pcs_create_fn = hsa_ven_amd_pcs_create; + pcs_api->hsa_ven_amd_pcs_create_from_id_fn = hsa_ven_amd_pcs_create_from_id; + pcs_api->hsa_ven_amd_pcs_destroy_fn = hsa_ven_amd_pcs_destroy; + pcs_api->hsa_ven_amd_pcs_start_fn = hsa_ven_amd_pcs_start; + pcs_api->hsa_ven_amd_pcs_stop_fn = hsa_ven_amd_pcs_stop; + pcs_api->hsa_ven_amd_pcs_flush_fn = hsa_ven_amd_pcs_flush; +} + +} // namespace pcs +} // namespace rocr diff --git 
a/src/pcs/inc/hsa_ven_amd_pc_sampling_impl.h b/src/pcs/inc/hsa_ven_amd_pc_sampling_impl.h new file mode 100644 index 000000000..58ed4d437 --- /dev/null +++ b/src/pcs/inc/hsa_ven_amd_pc_sampling_impl.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_IMPL_H +#define HSA_VEN_AMD_PC_SAMPLING_IMPL_H + +#include "inc/hsa.h" +#include "inc/hsa_ext_amd.h" +#include "inc/hsa_ven_amd_pc_sampling.h" +#include "core/inc/hsa_ext_interface.h" + +//---------------------------------------------------------------------------// +// APIs that implement PC Sampling functionality +//---------------------------------------------------------------------------// + +namespace rocr { +namespace pcs { + +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +// Update Api table with func pointers that implement 
functionality +void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api); + +// Release resources acquired by Image implementation +void ReleasePcSamplingRsrcs(); + +} // namespace pcs +} // namespace rocr + +#endif // HSA_VEN_AMD_PC_SAMPLING_IMPL_H diff --git a/src/pcs/pcs_runtime.cpp b/src/pcs/pcs_runtime.cpp new file mode 100644 index 000000000..9d453bb31 --- /dev/null +++ b/src/pcs/pcs_runtime.cpp @@ -0,0 +1,355 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "pcs_runtime.h" + +#include +#include + +#include "core/inc/runtime.h" + +#include "core/inc/amd_gpu_agent.h" + +namespace rocr { +namespace pcs { + +#define IS_BAD_PTR(ptr) \ +do { \ + if ((ptr) == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT; \ +} while (false) + +std::atomic PcsRuntime::instance_(NULL); +std::mutex PcsRuntime::instance_mutex_; + +PcsRuntime* PcsRuntime::instance() { + PcsRuntime* instance = instance_.load(std::memory_order_acquire); + if (instance == NULL) { + // Protect the initialization from multi threaded access. + std::lock_guard lock(instance_mutex_); + + // Make sure we are not initializing it twice. 
+ instance = instance_.load(std::memory_order_relaxed); + if (instance != NULL) { + return instance; + } + + instance = CreateSingleton(); + if (instance == NULL) { + return NULL; + } + } + + return instance; +} + +PcsRuntime* PcsRuntime::CreateSingleton() { + PcsRuntime* instance = new PcsRuntime(); + + instance_.store(instance, std::memory_order_release); + return instance; +} + +void PcsRuntime::DestroySingleton() { + PcsRuntime* instance = instance_.load(std::memory_order_acquire); + if (instance == NULL) { + return; + } + + instance_.store(NULL, std::memory_order_release); + delete instance; +} + +void ReleasePcSamplingRsrcs() { PcsRuntime::DestroySingleton(); } + +bool PcsRuntime::SessionsActive() const { + return pc_sampling_.size() > 0; +} + +PcsRuntime::PcSamplingSession::PcSamplingSession( + core::Agent* _agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, + size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data) + : agent(_agent), thunkId_(0), active_(false), valid_(true), sample_size_(0) { + switch (method) { + case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1: + sample_size_ = sizeof(perf_sample_hosttrap_v1_t); + break; + case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1: + sample_size_ = sizeof(perf_sample_snapshot_v1_t); + break; + default: + valid_ = false; + return; + } + + if (!interval || !buffer_size || (buffer_size % (2 * sample_size_))) { + valid_ = false; + return; + } + + csd.method = method; + csd.units = units; + csd.interval = interval; + csd.latency = latency; + csd.buffer_size = buffer_size; + csd.data_ready_callback = data_ready_callback; + csd.client_callback_data = client_callback_data; +} + +void PcsRuntime::PcSamplingSession::GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo) { + sampleInfo->value_min = 0; + sampleInfo->value_max = 0; + sampleInfo->flags = 0; + sampleInfo->value = csd.interval; + + switch (csd.method) { + case 
HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1: + sampleInfo->method = HSA_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1; + break; + case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1: + sampleInfo->method = HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1; + break; + } + + switch (csd.units) { + case HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS: + sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS; + break; + case HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES: + sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_CYCLES; + break; + case HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS: + sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS; + break; + } +} + +hsa_status_t PcSamplingDataCopyCallback(void* _session, size_t bytes_to_copy, void* destination) { + assert(_session); + assert(destination); + + PcsRuntime::PcSamplingSession* session = + reinterpret_cast(_session); + + return session->DataCopyCallback(reinterpret_cast(destination), bytes_to_copy); +} + +hsa_status_t PcsRuntime::PcSamplingSession::DataCopyCallback(uint8_t* buffer, + size_t bytes_to_copy) { + if (bytes_to_copy != (data_rdy.buf1_sz + data_rdy.buf2_sz)) return HSA_STATUS_ERROR_EXCEPTION; + + if (data_rdy.buf1_sz) memcpy(buffer, data_rdy.buf1, data_rdy.buf1_sz); + if (data_rdy.buf2_sz) memcpy(buffer + data_rdy.buf1_sz, data_rdy.buf2, data_rdy.buf2_sz); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t PcsRuntime::PcSamplingSession::HandleSampleData(uint8_t* buf1, size_t buf1_sz, + uint8_t* buf2, size_t buf2_sz, + size_t lost_sample_count) { + data_rdy.buf1 = buf1; + data_rdy.buf1_sz = buf1_sz; + data_rdy.buf2 = buf2; + data_rdy.buf2_sz = buf2_sz; + + AMD::GpuAgent* gpuAgent = static_cast(agent); + + switch (csd.method) { + case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1: { + size_t buf_samples = buf1_sz / sizeof(perf_sample_hosttrap_v1_t); + perf_sample_hosttrap_v1_t* samples = reinterpret_cast(buf1); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + + buf_samples = 
buf2_sz / sizeof(perf_sample_hosttrap_v1_t); + samples = reinterpret_cast(buf2); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + } + break; + case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1: { + size_t buf_samples = buf1_sz / sizeof(perf_sample_snapshot_v1_t); + perf_sample_snapshot_v1_t* samples = reinterpret_cast(buf1); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + + buf_samples = buf2_sz / sizeof(perf_sample_snapshot_v1_t); + samples = reinterpret_cast(buf2); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + } + break; + } + + csd.data_ready_callback(csd.client_callback_data, buf1_sz + buf2_sz, lost_sample_count, + &PcSamplingDataCopyCallback, + /* hsa_callback_data*/ this); + return HSA_STATUS_SUCCESS; +} + +hsa_status_t PcsRuntime::PcSamplingIterateConfig( + core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data) { + AMD::GpuAgentInt* gpu_agent = static_cast(agent); + return gpu_agent->PcSamplingIterateConfig(configuration_callback, callback_data); +} + +hsa_status_t PcsRuntime::PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + + IS_BAD_PTR(handle); + IS_BAD_PTR(data_ready_cb); + + return PcSamplingCreateInternal( + agent, method, units, interval, latency, buffer_size, data_ready_cb, client_cb_data, handle, + [](core::Agent* agent_, PcSamplingSession& session_) { + return static_cast(agent_)->PcSamplingCreate(session_); + }); +} + +hsa_status_t PcsRuntime::PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent, + hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, 
size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + IS_BAD_PTR(handle); + IS_BAD_PTR(data_ready_cb); + + return PcSamplingCreateInternal( + agent, method, units, interval, latency, buffer_size, data_ready_cb, client_cb_data, handle, + [&](core::Agent* agent_, PcSamplingSession& session_) { + return static_cast(agent_)->PcSamplingCreateFromId(ioctl_pcs_id, + session_); + }); +} + +hsa_status_t PcsRuntime::PcSamplingCreateInternal( + core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, + size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data, + hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) { + ScopedAcquire lock(&pc_sampling_lock_); + + handle->handle = ++pc_sampling_id_; + // create a new PcSamplingSession(agent, method, units, interval, latency, buffer_size, + // data_ready_cb, client_cb_data) reference and insert into pc_sampling_ + pc_sampling_.emplace(std::piecewise_construct, std::forward_as_tuple(handle->handle), + std::forward_as_tuple(agent, method, units, interval, latency, buffer_size, + data_ready_cb, client_cb_data)); + + if (!pc_sampling_[handle->handle].isValid()) { + pc_sampling_.erase(handle->handle); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_status_t ret = agent_pcs_create_fn(agent, pc_sampling_[handle->handle]); + if (ret != HSA_STATUS_SUCCESS) { + pc_sampling_.erase(handle->handle); + return ret; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + 
AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + hsa_status_t ret = gpu_agent->PcSamplingDestroy(pcSamplingSessionIt->second); + pc_sampling_.erase(pcSamplingSessionIt); + return ret; +} + +hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingStart(pcSamplingSessionIt->second); +} + +hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingStop(pcSamplingSessionIt->second); +} + +hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingFlush(pcSamplingSessionIt->second); +} + +} // namespace pcs +} // namespace rocr diff --git a/src/pcs/pcs_runtime.h b/src/pcs/pcs_runtime.h new file mode 100644 index 000000000..6fa489c73 --- /dev/null +++ b/src/pcs/pcs_runtime.h @@ -0,0 +1,176 @@ 
+//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_PCS_RUNTIME_H +#define HSA_RUNTIME_PCS_RUNTIME_H + +#include +#include +#include + +#include "hsakmt/hsakmt.h" + +#include "hsa_ven_amd_pc_sampling.h" +#include "core/inc/agent.h" +#include "core/inc/exceptions.h" + + +namespace rocr { +namespace pcs { + +class PcsRuntime { + public: + PcsRuntime() : pc_sampling_id_(0) {} + ~PcsRuntime() {} + + /// @brief Getter for the PcsRuntime singleton object. + static PcsRuntime* instance(); + + bool SessionsActive() const; + + /// @brief Destroy singleton object. + static void DestroySingleton(); + + class PcSamplingSession { + public: + PcSamplingSession() : agent(NULL), thunkId_(0), active_(false){}; + PcSamplingSession(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data); + ~PcSamplingSession(){}; + + const bool isValid() { return valid_; } + const size_t buffer_size() { return csd.buffer_size; } + const hsa_ven_amd_pcs_method_kind_t method() { return csd.method; } + const size_t latency() { return csd.latency; } + const size_t sample_size() { return sample_size_; } + + void GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo); + hsa_status_t HandleSampleData(uint8_t* buf1, size_t buf1_sz, uint8_t* buf2, size_t buf2_sz, + size_t lost_sample_count); + hsa_status_t DataCopyCallback(uint8_t* buffer, size_t buffer_size); + + core::Agent* agent; + void SetThunkId(HsaPcSamplingTraceId thunkId) { thunkId_ = thunkId; } + HsaPcSamplingTraceId ThunkId() { return thunkId_; } + bool isActive() { return active_; } + void start() { active_ = true; } + void stop() { active_ = false; } + + private: + HsaPcSamplingTraceId thunkId_; + + bool active_; // Set to true when the session is started + bool valid_; // Whether configuration parameters are valid + 
size_t sample_size_; + + struct client_session_data_t { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t interval; + size_t latency; + size_t buffer_size; + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback; + void* client_callback_data; + }; + struct client_session_data_t csd; + + struct data_ready_info_t { + uint8_t* buf1; + size_t buf1_sz; + uint8_t* buf2; + size_t buf2_sz; + }; + struct data_ready_info_t data_rdy; + }; // class PcSamplingSession + + hsa_status_t PcSamplingIterateConfig( + core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle); + + + hsa_status_t PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent, + hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle); + + hsa_status_t PcSamplingDestroy(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingStart(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingStop(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingFlush(hsa_ven_amd_pcs_t handle); + + private: + /// @brief Initialize singleton object, must be called once. + static PcsRuntime* CreateSingleton(); + + /// Pointer to singleton object. 
+ static std::atomic instance_; + static std::mutex instance_mutex_; + + // Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle + std::map pc_sampling_; + KernelMutex pc_sampling_lock_; + uint64_t pc_sampling_id_; + + DISALLOW_COPY_AND_ASSIGN(PcsRuntime); + + using agent_pcs_create_fn_t = std::function; + hsa_status_t PcSamplingCreateInternal(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle, + agent_pcs_create_fn_t agent_pcs_create_fn); +}; + +} // namespace pcs +} // namespace rocr +#endif // HSA_RUNTIME_PCS_RUNTIME_H