diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 321d6e98a..ea4f39820 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -65,6 +65,7 @@ include ( GNUInstallDirs ) if ( NOT DEFINED BUILD_SHARED_LIBS ) set ( BUILD_SHARED_LIBS ON ) endif() + set ( BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS} CACHE BOOL "Build shared library (.so) or not.") ## Adjust target name for static builds @@ -85,7 +86,7 @@ if (ROCM_CCACHE_BUILD) endif() # if (ROCM_CCACHE_BUILD) ## Get version strings -get_version ( "1.13.0" ) +get_version ( "1.14.0" ) if ( ${ROCM_PATCH_VERSION} ) set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) endif() @@ -158,7 +159,8 @@ set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY LINK_FLAGS ${HSA_SHARED_LINK ## ------------------------- End Compiler and Linker options ---------------------------- ## Source files. -set ( SRCS core/util/lnx/os_linux.cpp +set ( SRCS core/driver/driver.cpp + core/util/lnx/os_linux.cpp core/util/small_heap.cpp core/util/timer.cpp core/util/flag.cpp @@ -208,6 +210,16 @@ add_dependencies( ${CORE_RUNTIME_TARGET} amd_trap_handler_v2 ) add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/blit_shaders ) add_dependencies( ${CORE_RUNTIME_TARGET} amd_blit_shaders_v2) +option(PC_SAMPLING_SUPPORT "Enable PC Sampling Support" ON) + +if (${PC_SAMPLING_SUPPORT}) + target_compile_definitions(${CORE_RUNTIME_TARGET} PRIVATE HSA_PC_SAMPLING_SUPPORT) + + set( PCS_SRCS pcs/hsa_ven_amd_pc_sampling.cpp pcs/pcs_runtime.cpp ) + + target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${PCS_SRCS} ) +endif() + if ( NOT DEFINED IMAGE_SUPPORT AND CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64|loongarch64" ) set ( IMAGE_SUPPORT ON ) endif() @@ -228,6 +240,7 @@ if(${IMAGE_SUPPORT}) image/addrlib/src/core/addrlib.cpp image/addrlib/src/core/addrlib1.cpp image/addrlib/src/core/addrlib2.cpp + image/addrlib/src/core/addrlib3.cpp image/addrlib/src/core/addrobject.cpp image/addrlib/src/core/addrelemlib.cpp image/addrlib/src/r800/ciaddrlib.cpp @@ -236,6 +249,7 @@ 
if(${IMAGE_SUPPORT}) image/addrlib/src/gfx9/gfx9addrlib.cpp image/addrlib/src/gfx10/gfx10addrlib.cpp image/addrlib/src/gfx11/gfx11addrlib.cpp + image/addrlib/src/gfx12/gfx12addrlib.cpp image/device_info.cpp image/hsa_ext_image.cpp image/image_runtime.cpp @@ -244,6 +258,7 @@ if(${IMAGE_SUPPORT}) image/image_manager_ai.cpp image/image_manager_nv.cpp image/image_manager_gfx11.cpp + image/image_manager_gfx12.cpp image/image_lut_kv.cpp image/image_lut_gfx11.cpp image/blit_object_gfx7xx.cpp @@ -265,10 +280,12 @@ if(${IMAGE_SUPPORT}) ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx9 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx10 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx11 + ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/gfx12 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/r800 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx9 ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx10 - ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx11 ) + ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx11 + ${CMAKE_CURRENT_SOURCE_DIR}/image/addrlib/src/chip/gfx12 ) target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${IMAGE_SRCS} ) @@ -278,21 +295,27 @@ if(${IMAGE_SUPPORT}) endif() -## Link dependencies. 
-target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE hsakmt::hsakmt PkgConfig::drm) target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE elf::elf dl pthread rt ) - -find_package(rocprofiler-register) -if(rocprofiler-register_FOUND) +# For static package rocprofiler-register dependency is not required +# Link to hsakmt target for shared library builds +# Link to hsakmt-staticdrm target for static library builds +if( BUILD_SHARED_LIBS ) + target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE hsakmt::hsakmt PkgConfig::drm) + find_package(rocprofiler-register) + if(rocprofiler-register_FOUND) target_compile_definitions(${CORE_RUNTIME_TARGET} PRIVATE HSA_ROCPROFILER_REGISTER=1 HSA_VERSION_MAJOR=${VERSION_MAJOR} HSA_VERSION_MINOR=${VERSION_MINOR} HSA_VERSION_PATCH=${VERSION_PATCH}) target_link_libraries(${CORE_RUNTIME_TARGET} PRIVATE rocprofiler-register::rocprofiler-register) set(HSA_DEP_ROCPROFILER_REGISTER ON) -else() + else() set(HSA_DEP_ROCPROFILER_REGISTER OFF) -endif() + endif() # end rocprofiler-register_FOUND +else() + include_directories(${drm_INCLUDE_DIRS}) + target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE hsakmt-staticdrm::hsakmt-staticdrm) +endif()#end BUILD_SHARED_LIBS ## Set the VERSION and SOVERSION values set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY VERSION "${SO_VERSION_STRING}" ) @@ -312,7 +335,7 @@ if( NOT ${BUILD_SHARED_LIBS} ) add_dependencies( ${CORE_RUNTIME_NAME} ${CORE_RUNTIME_TARGET} ) ## Add external link requirements. 
- target_link_libraries ( ${CORE_RUNTIME_NAME} INTERFACE hsakmt::hsakmt ) + target_link_libraries ( ${CORE_RUNTIME_NAME} INTERFACE hsakmt-staticdrm::hsakmt-staticdrm ) target_link_libraries ( ${CORE_RUNTIME_NAME} INTERFACE elf::elf dl pthread rt ) install ( TARGETS ${CORE_RUNTIME_NAME} EXPORT ${CORE_RUNTIME_NAME}Targets ) @@ -417,7 +440,6 @@ endif() ## Packaging directives set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Package types to build") set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.") - ## Only pack the "binary" and "dev" components, post install script will add the directory link. set ( CPACK_COMPONENTS_ALL binary dev ) # ASAN Package will have libraries and license file @@ -518,5 +540,22 @@ endif() set ( CPACK_RPM_PACKAGE_PROVIDES "hsa-ext-rocr-dev hsa-rocr-dev" ) set ( CPACK_RPM_PACKAGE_OBSOLETES "hsa-ext-rocr-dev" ) +if( NOT BUILD_SHARED_LIBS ) + # Suffix package name with static + set ( CPACK_RPM_STATIC_PACKAGE_NAME "hsa-rocr-static-devel") + set ( CPACK_DEBIAN_STATIC_PACKAGE_NAME "hsa-rocr-static-dev") + set ( CPACK_COMPONENT_STATIC_DESCRIPTION "HSA (Heterogenous System Architecture) core runtime - Linux static libraries" ) + set ( CPACK_RPM_STATIC_PACKAGE_REQUIRES "${CPACK_RPM_BINARY_PACKAGE_REQUIRES}" ) + string ( APPEND CPACK_RPM_STATIC_PACKAGE_REQUIRES " hsakmt-roct-devel" ) + set ( CPACK_DEBIAN_STATIC_PACKAGE_DEPENDS "${CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS}" ) + string ( APPEND CPACK_DEBIAN_STATIC_PACKAGE_DEPENDS ", hsakmt-roct-dev" ) +endif() ## Include packaging include ( CPack ) +# static package generation +# Group binary and dev component to single package +if( NOT BUILD_SHARED_LIBS ) + cpack_add_component_group("static") + cpack_add_component( binary GROUP static ) + cpack_add_component( dev GROUP static ) +endif() diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index 58b35f402..31196835c 100644 --- a/src/core/common/hsa_table_interface.cpp +++ 
b/src/core/common/hsa_table_interface.cpp @@ -1238,6 +1238,11 @@ hsa_status_t HSA_API hsa_amd_vmem_address_reserve(void** ptr, size_t size, uint6 return amdExtTable->hsa_amd_vmem_address_reserve_fn(ptr, size, address, flags); } +hsa_status_t HSA_API hsa_amd_vmem_address_reserve_align(void** ptr, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags) { + return amdExtTable->hsa_amd_vmem_address_reserve_align_fn(ptr, size, address, alignment, flags); +} + hsa_status_t HSA_API hsa_amd_vmem_address_free(void* ptr, size_t size) { return amdExtTable->hsa_amd_vmem_address_free_fn(ptr, size); } @@ -1298,6 +1303,11 @@ hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, si return amdExtTable->hsa_amd_agent_set_async_scratch_limit_fn(agent, threshold); } +hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* queue, + hsa_queue_info_attribute_t attribute, void* value) { + return amdExtTable->hsa_amd_queue_get_info_fn(queue, attribute, value); +} + // Tools only table interfaces. namespace rocr { diff --git a/src/core/driver/driver.cpp b/src/core/driver/driver.cpp new file mode 100644 index 000000000..02407e6b2 --- /dev/null +++ b/src/core/driver/driver.cpp @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/driver.h" + +#include +#include + +#include "inc/hsa.h" + +namespace rocr { +namespace core { + +Driver::Driver(const std::string devnode_name, Agent::DeviceType agent_device_type) + : agent_device_type_(agent_device_type), devnode_name_(devnode_name) { } + +hsa_status_t Driver::Open() +{ + fd_ = open(devnode_name_.c_str(), O_RDWR | O_CLOEXEC); + if (fd_ < 0) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Driver::Close() +{ + int ret(0); + if (fd_ > 0) { + ret = close(fd_); + fd_ = -1; + } + if (ret) { + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +} // namespace core +} // namespace rocr diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h index ddb8671c2..09f14f941 100644 --- a/src/core/inc/amd_aql_queue.h +++ b/src/core/inc/amd_aql_queue.h @@ -196,8 +196,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo /// @return hsa_status_t hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override; - /// @brief Submits a block of PM4 and waits until it has been executed. - void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override; + // @brief Submits a block of PM4 and waits until it has been executed. 
+ void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) override; /// @brief Enables/Disables profiling overrides SetProfiling from core::Queue void SetProfiling(bool enabled) override; @@ -208,6 +211,9 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo /// @brief Update signal value using Release semantics void StoreRelease(hsa_signal_value_t value) override; + /// @brief Provide information about the queue + hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) override; + /// @brief Enable use of GWS from this queue. hsa_status_t EnableGWS(int gws_slot_count); @@ -246,9 +252,11 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo void FillBufRsrcWord3(); void FillBufRsrcWord3_Gfx10(); void FillBufRsrcWord3_Gfx11(); + void FillBufRsrcWord3_Gfx12(); void FillComputeTmpRingSize(); void FillAltComputeTmpRingSize(); void FillComputeTmpRingSize_Gfx11(); + void FillComputeTmpRingSize_Gfx12(); void FreeMainScratchSpace(); void FreeAltScratchSpace(); diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index b8a5a41f5..7b47fa01a 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -46,6 +46,7 @@ #define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ #include +#include #include #include "hsakmt/hsakmt.h" @@ -59,6 +60,7 @@ #include "core/util/small_heap.h" #include "core/util/locks.h" #include "core/util/lazy_ptr.h" +#include "pcs/pcs_runtime.h" namespace rocr { namespace AMD { @@ -186,6 +188,24 @@ class GpuAgentInt : public core::Agent { // Only valid when async scratch reclaim is supported // @retval HSA_STATUS_SUCCESS if successful virtual hsa_status_t SetAsyncScratchThresholds(size_t use_once_limit) = 0; + + // @brief Iterate through supported PC Sampling configurations + // @retval HSA_STATUS_SUCCESS if 
successful + virtual hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, + void* cb_data) = 0; + + virtual hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, + pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) = 0; }; class GpuAgent : public GpuAgentInt { @@ -380,6 +400,9 @@ class GpuAgent : public GpuAgentInt { if (t0_.GPUClockCounter == t1_.GPUClockCounter) SyncClocks(); } + const size_t MAX_SCRATCH_APERTURE_PER_XCC = (1ULL << 32); + size_t MaxScratchDevice() const { return properties_.NumXcc * MAX_SCRATCH_APERTURE_PER_XCC; } + void ReserveScratch(); // @brief If agent supports it, release scratch memory for all AQL queues on this agent. @@ -408,6 +431,13 @@ class GpuAgent : public GpuAgentInt { const std::function& system_deallocator() const { return system_deallocator_; } + const std::function& + finegrain_allocator() const { + return finegrain_allocator_; + } + + const std::function& finegrain_deallocator() const { return finegrain_deallocator_; } + protected: // Sizes are in packets. static const uint32_t minAqlSize_ = 0x40; // 4KB min @@ -452,10 +482,25 @@ class GpuAgent : public GpuAgentInt { // @brief Binds the second-level trap handler to this node. void BindTrapHandler(); + hsa_status_t UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* stochastic_hosttrap_buffers); // @brief Override from core::Agent. 
hsa_status_t EnableDmaProfiling(bool enable) override; + hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, + void* cb_data); + hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, + pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session); + + static void PcSamplingThreadRun(void* agent); + void PcSamplingThread(); + // @brief Node properties. const HsaNodeProperties properties_; @@ -494,8 +539,9 @@ class GpuAgent : public GpuAgentInt { // @brief AQL queues for cache management and blit compute usage. enum QueueEnum { - QueueUtility, // Cache management and device to {host,device} blit compute - QueueBlitOnly, // Host to device blit + QueueUtility, // Cache management and device to {host,device} blit compute + QueueBlitOnly, // Host to device blit + QueuePCSampling, // Dedicated high priority queue for PC Sampling QueueCount }; @@ -578,8 +624,8 @@ class GpuAgent : public GpuAgentInt { // @brief Setup GWS accessing queue. void InitGWS(); - // @brief Setup NUMA aware system memory allocator. 
- void InitNumaAllocator(); + // @brief Set-up memory allocators + void InitAllocators(); // @brief Initialize scratch handler thresholds void InitAsyncScratchThresholds(); @@ -654,6 +700,58 @@ class GpuAgent : public GpuAgentInt { std::function system_deallocator_; + // Fine grain allocator on this device + std::function finegrain_allocator_; + + std::function finegrain_deallocator_; + + void* trap_handler_tma_region_; + + /* PC Sampling fields - begin */ + /* 2nd level Trap handler code is based on the offsets within this structure */ + typedef struct { + uint64_t buf_write_val; + uint32_t buf_size; + uint32_t reserved0; + uint32_t buf_written_val0; + uint32_t buf_watermark0; + hsa_signal_t done_sig0; + uint32_t buf_written_val1; + uint32_t buf_watermark1; + hsa_signal_t done_sig1; + uint8_t reserved1[16]; + /* pc_sample_t buffer0[buf_size]; */ + /* pc_sample_t buffer1[buf_size]; */ + } pcs_hosttrap_sampling_data_t; + + typedef struct { + /* Hosttrap data - stored on device so that trap_handler code can access efficiently */ + pcs_hosttrap_sampling_data_t* device_data; + + /* Hosttrap host buffer - stored on host */ + uint8_t* host_buffer; + size_t host_buffer_size; + uint8_t* host_buffer_wrap_pos; + uint8_t* host_write_ptr; + uint8_t* host_read_ptr; + size_t lost_sample_count; + std::mutex host_buffer_mutex; + + uint32_t which_buffer; + uint64_t* old_val; + uint32_t* cmd_data; + size_t cmd_data_sz; + // signal to pass into ExecutePM4() so that we do not need to re-allocate a + // new signal on each call + hsa_signal_t exec_pm4_signal; + + os::Thread thread; + pcs::PcsRuntime::PcSamplingSession* session; + } pcs_hosttrap_t; + + pcs_hosttrap_t pcs_hosttrap_data_; + /* PC Sampling fields - end */ + // @brief device handle amdgpu_device_handle ldrm_dev_; diff --git a/src/core/inc/amd_gpu_pm4.h b/src/core/inc/amd_gpu_pm4.h index 7ebf0c399..65191d5bb 100644 --- a/src/core/inc/amd_gpu_pm4.h +++ b/src/core/inc/amd_gpu_pm4.h @@ -43,11 +43,19 @@ #ifndef 
HSA_RUNTIME_CORE_INC_AMD_GPU_PM4_H_ #define HSA_RUNTIME_CORE_INC_AMD_GPU_PM4_H_ + // clang-format off + #define PM4_HDR_IT_OPCODE_NOP 0x10 #define PM4_HDR_IT_OPCODE_INDIRECT_BUFFER 0x3F #define PM4_HDR_IT_OPCODE_RELEASE_MEM 0x49 #define PM4_HDR_IT_OPCODE_ACQUIRE_MEM 0x58 +#define PM4_HDR_IT_OPCODE_ATOMIC_MEM 0x1E +#define PM4_HDR_IT_OPCODE_WRITE_DATA 0x37 +#define PM4_HDR_IT_OPCODE_WAIT_REG_MEM 0x3C +#define PM4_HDR_IT_OPCODE_COPY_DATA 0x40 +#define PM4_HDR_IT_OPCODE_DMA_DATA 0x50 + #define PM4_HDR_SHADER_TYPE(x) (((x) & 0x1) << 1) #define PM4_HDR_IT_OPCODE(x) (((x) & 0xFF) << 8) #define PM4_HDR_COUNT(x) (((x) & 0x3FFF) << 16) @@ -82,4 +90,51 @@ #define PM4_RELEASE_MEM_DW1_EVENT_INDEX(x) (((x) & 0xF) << 8) # define PM4_RELEASE_MEM_EVENT_INDEX_AQL 0x7 +#define PM4_ATOMIC_MEM_DW1_ATOMIC(x) (((x) & 0x7F) << 0) +# define PM4_ATOMIC_MEM_GL2_OP_ATOMIC_SWAP_RTN_64 (39 << 0) +#define PM4_ATOMIC_MEM_DW2_ADDR_LO(x) (((x) & 0xFFFFFFF8) << 0) +#define PM4_ATOMIC_MEM_DW3_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_ATOMIC_MEM_DW4_SRC_DATA_LO(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_ATOMIC_MEM_DW5_SRC_DATA_HI(x) (((x) & 0xFFFFFFFF) << 0) + +#define PM4_COPY_DATA_DW1(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_COPY_DATA_SRC_SEL_ATOMIC_RETURN_DATA (6 << 0) +# define PM4_COPY_DATA_DST_SEL_TC_12 (2 << 8) +# define PM4_COPY_DATA_COUNT_SEL (1 << 16) +# define PM4_COPY_DATA_WR_CONFIRM (1 << 20) +#define PM4_COPY_DATA_DW4_DST_ADDR_LO(x) (((x) & 0xFFFFFFF8) << 0) +#define PM4_COPY_DATA_DW5_DST_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) + +#define PM4_WAIT_REG_MEM_DW1(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_WAIT_REG_MEM_FUNCTION_EQUAL_TO_REFERENCE (3 << 0) +# define PM4_WAIT_REG_MEM_MEM_SPACE_MEMORY_SPACE (1 << 4) +# define PM4_WAIT_REG_MEM_OPERATION_WAIT_REG_MEM (0 << 6) +#define PM4_WAIT_REG_MEM_DW2_MEM_POLL_ADDR_LO(x) (((x) & 0xFFFFFFFC) << 0) +#define PM4_WAIT_REG_MEM_DW3_MEM_POLL_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_WAIT_REG_MEM_DW4_REFERENCE(x) (((x) & 0xFFFFFFFF) << 
0) +#define PM4_WAIT_REG_MEM_DW6(x) (((x) & 0x8000FFFF) << 0) +# define PM4_WAIT_REG_MEM_POLL_INTERVAL(x) (((x) & 0xFFFF) << 0) +# define PM4_WAIT_REG_MEM_OPTIMIZE_ACE_OFFLOAD_MODE (1 << 31) + +#define PM4_DMA_DATA_DW1(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_DMA_DATA_DST_SEL_DST_ADDR_USING_L2 (3 << 20) +# define PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2 (3 << 29) +#define PM4_DMA_DATA_DW2_SRC_ADDR_LO(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW3_SRC_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW4_DST_ADDR_LO(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW5_DST_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_DMA_DATA_DW6(x) (((x) & 0xFFFFFFFF) << 0) +# define PM4_DMA_DATA_BYTE_COUNT(x) (((x) & 0x3FFFFFF) << 0) +# define PM4_DMA_DATA_DIS_WC (1 << 31) +# define PM4_DMA_DATA_DIS_WC_LAST (0 << 31) + +#define PM4_WRITE_DATA_DW1(x) (((x) & 0xFFFFFF00) << 0) +# define PM4_WRITE_DATA_DST_SEL_TC_L2 (2 << 8) +# define PM4_WRITE_DATA_WR_CONFIRM_WAIT_CONFIRMATION (1 << 20) +#define PM4_WRITE_DATA_DW2_DST_MEM_ADDR_LO(x) (((x) & 0xFFFFFFFC) << 0) +#define PM4_WRITE_DATA_DW3_DST_MEM_ADDR_HI(x) (((x) & 0xFFFFFFFF) << 0) +#define PM4_WRITE_DATA_DW4_DATA(x) (((x) & 0xFFFFFFFF) << 0) + +// clang-format on + #endif // header guard diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h deleted file mode 100644 index e5ee1c4ed..000000000 --- a/src/core/inc/amd_gpu_shaders.h +++ /dev/null @@ -1,901 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. 
-// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. 
-// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_SHADERS_H_ -#define HSA_RUNTIME_CORE_INC_AMD_GPU_SHADERS_H_ - -namespace rocr { -namespace AMD { - -static const unsigned int kCodeCopyAligned7[] = { - 0xC0820100, 0xC0840104, 0xC0860108, 0xC088010C, 0xC08A0110, 0xC00C0114, - 0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900, - 0xD2506A03, 0x01A90103, 0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05, - 0x01A90105, 0xD1C2006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, 0xDC200000, - 0x01000002, 0xBF8C0F70, 0xD24A6A02, 0x00003102, 0xD2506A03, 0x01A90103, - 0xDC600000, 0x00000104, 0xD24A6A04, 0x00003104, 0xD2506A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, 0xD24A6A02, - 0x00001101, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001501, - 0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000E, 0xDC380000, - 0x08000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70, - 0xDC780000, 0x00000804, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105, - 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD24A6A02, 0x00001901, - 0xD2506A03, 0x01A90103, 0x7E0A020F, 0xD24A6A04, 0x00001D01, 0xD2506A05, - 0x01A90105, 0xD1C2006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, 0xDC300000, - 0x01000002, 0xD24A6A02, 0x00003302, 0xD2506A03, 0x01A90103, 0xBF8C0F70, - 0xDC700000, 0x00000104, 0xD24A6A04, 0x00003304, 0xD2506A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD24A6A02, 0x00002100, 0xD2506A03, - 0x01A90103, 0x7E0A0213, 0xD24A6A04, 0x00002500, 0xD2506A05, 0x01A90105, - 0xD1C2006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, 0x01000002, - 0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000, -}; - -static const unsigned int kCodeCopyMisaligned7[] = { - 0xC0820100, 0xC0840104, 0xC0860108, 0xC008010C, 0xBF8C007F, 0x8F028602, - 0x4A000002, 0x7E060205, 0xD24A6A02, 0x00000900, 0xD2506A03, 0x01A90103, - 0x7E0A0207, 0xD24A6A04, 0x00000D00, 0xD2506A05, 
0x01A90105, 0xD1C2006A, - 0x00001102, 0xBF860032, 0xDC200000, 0x06000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xDC200000, 0x07000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xDC200000, 0x08000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xDC200000, 0x09000002, 0xD24A6A02, 0x00002102, - 0xD2506A03, 0x01A90103, 0xBF8C0F70, 0xDC600000, 0x00000604, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000704, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000804, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xDC600000, 0x00000904, 0xD24A6A04, - 0x00002104, 0xD2506A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD24A6A02, - 0x00001100, 0xD2506A03, 0x01A90103, 0x7E0A020B, 0xD24A6A04, 0x00001500, - 0xD2506A05, 0x01A90105, 0xD1C2006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, - 0xDC200000, 0x01000002, 0xD24A6A02, 0x00002102, 0xD2506A03, 0x01A90103, - 0xBF8C0F70, 0xDC600000, 0x00000104, 0xD24A6A04, 0x00002104, 0xD2506A05, - 0x01A90105, 0xBF82FFEE, 0xBF810000, -}; - -static const unsigned int kCodeFill7[] = { - 0xC0820100, 0xC0840104, 0xBF8C007F, 0x8F028602, 0x4A000002, 0x7E08020A, - 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8F0C840B, 0x34020084, 0x7E060205, - 0xD24A6A02, 0x00000901, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00000D02, - 0xBF860007, 0xDC780000, 0x00000402, 0xD24A6A02, 0x00001902, 0xD2506A03, - 0x01A90103, 0xBF82FFF6, 0x8F0C820B, 0x34020082, 0x7E060207, 0xD24A6A02, - 0x00000D01, 0xD2506A03, 0x01A90103, 0xD1C2006A, 0x00001102, 0xBF860008, - 0x87FE6A7E, 0xDC700000, 0x00000402, 0xD24A6A02, 0x00001902, 0xD2506A03, - 0x01A90103, 0xBF82FFF5, 0xBF810000, -}; - -static const unsigned int kCodeTrapHandler8[] = { - 0xC0061C80, 0x000000C0, 0xBF8C007F, 0xBEFE0181, 0x80728872, 0x82738073, - 0x7E000272, 0x7E020273, 0x7E0402FF, 0x80000000, 0x7E060280, 0xDD800000, - 0x00000200, 0xBF8C0F70, 0x7DD40500, 0xBF870011, 0xC0061D39, 0x00000008, - 0xBF8C007F, 0x86F47474, 0xBF84000C, 0x80729072, 0x82738073, 0xC0021CB9, - 0x00000000, 
0xBF8C007F, 0x7E000274, 0x7E020275, 0x7E040272, 0xDC700000, - 0x00000200, 0xBF8C0F70, 0xBF900001, 0xBF8D0001, 0xBE801F70, -}; - -static const unsigned int kCodeTrapHandler9[] = { -/* - .set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF - .set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 - .set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 - .set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) - .set SQ_WAVE_PC_HI_HT_MASK , 0x1000000 - .set SQ_WAVE_STATUS_HALT_BIT , 13 - .set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_BIT | (1 << 16)) - .set SQ_WAVE_TRAPSTS_ADDRESS_WATCH_MASK , 0x7080 - .set SQ_WAVE_TRAPSTS_MEM_VIOL_MASK , 0x100 - .set SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK , 0x800 - .set SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK , 0x10000000 - .set SQ_WAVE_MODE_DEBUG_EN_SHIFT , 11 - .set SIGNAL_CODE_MEM_VIOL , (1 << 29) - .set SIGNAL_CODE_ILLEGAL_INST , (1 << 30) - .set SIGNAL_CODE_LLVM_TRAP , (1 << 31) - .set MAX_NUM_DOORBELLS_MASK , ((1 << 10) - 1) - .set SENDMSG_M0_DOORBELL_ID_BITS , 12 - .set SENDMSG_M0_DOORBELL_ID_MASK , ((1 << SENDMSG_M0_DOORBELL_ID_BITS) - 1) - - .set TTMP7_DISPATCH_ID_CONVERTED_BIT , 31 - .set TTMP7_WAVE_STOPPED_BIT , 30 - .set TTMP7_SAVED_STATUS_HALT_BIT , 29 - .set TTMP7_SAVED_TRAP_ID_SHIFT , 25 - .set TTMP7_SAVED_TRAP_ID_BITS , 4 - .set TTMP7_SAVED_TRAP_ID_MASK , ((1 << TTMP7_SAVED_TRAP_ID_BITS) - 1) - .set TTMP7_PACKET_INDEX_BITS , 25 - .set TTMP7_PACKET_INDEX_MASK , ((1 << TTMP7_PACKET_INDEX_BITS) - 1) - .set TTMP11_PC_HI_SHIFT , 7 - - .if .amdgcn.gfx_generation_number == 9 - .set DEBUG_INTERRUPT_CONTEXT_ID_BIT , 23 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 - .elseif .amdgcn.gfx_generation_number == 10 - .set DEBUG_INTERRUPT_CONTEXT_ID_BIT , 22 - .set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 - .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set 
SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000 - .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000 - .else - .error "unsupported target" - .endif - - // ABI between first and second level trap handler: - // ttmp0 = PC[31:0] - // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] - // ttmp12 = SQ_WAVE_STATUS - // ttmp14 = TMA[31:0] - // ttmp15 = TMA[63:32] - // gfx9: - // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[18:0], NoScratch[0], WaveIdInWG[5:0] - // gfx10: - // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], 0[16:0], NoScratch[0], WaveIdInWG[5:0] - - .macro mGetDoorbellId - s_mov_b32 exec_lo, 0x80000000 - s_sendmsg sendmsg(MSG_GET_DOORBELL) - .wait_sendmsg_\@: - s_nop 7 - s_bitcmp0_b32 exec_lo, 0x1F - s_cbranch_scc0 .wait_sendmsg_\@ - .endm - - .macro mExitTrap - // Restore SQ_WAVE_IB_STS. - .if .amdgcn.gfx_generation_number == 9 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - .endif - .if .amdgcn.gfx_generation_number == 10 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK - s_or_b32 ttmp2, ttmp2, ttmp3 - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - .endif - - // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 - - // Return to shader at unmodified PC. 
- s_rfe_b64 [ttmp0, ttmp1] - .endm - - trap_entry: - s_andn2_b32 ttmp7, ttmp7, (TTMP7_SAVED_TRAP_ID_MASK << TTMP7_SAVED_TRAP_ID_SHIFT) | (1 << TTMP7_SAVED_STATUS_HALT_BIT) - - // Save the entry status.halt in ttmp7.saved_status_halt - s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE - s_lshl_b32 ttmp2, ttmp2, TTMP7_SAVED_STATUS_HALT_BIT - s_or_b32 ttmp7, ttmp7, ttmp2 - - // If trap raised (non-zero trap id) then branch. - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc1 .trap_raised - - // If non-masked exception raised then branch. - s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) - s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK) - s_cbranch_scc1 .excp_raised - - .signal_debugger: - // Fetch doorbell index for our queue. - s_mov_b32 ttmp2, exec_lo - s_mov_b32 ttmp3, exec_hi - mGetDoorbellId - s_mov_b32 exec_hi, ttmp3 - - // Restore exec_lo, move the doorbell_id into ttmp3 - s_and_b32 ttmp3, exec_lo, SENDMSG_M0_DOORBELL_ID_MASK - s_mov_b32 exec_lo, ttmp2 - - // Set the debug interrupt context id. - // FIXME: Make conditional when exceptions are handled. - s_bitset1_b32 ttmp3, DEBUG_INTERRUPT_CONTEXT_ID_BIT - - // Send an interrupt to trigger event notification. - s_mov_b32 ttmp2, m0 - s_mov_b32 m0, ttmp3 - s_nop 0x0 // Manually inserted wait states - s_sendmsg sendmsg(MSG_INTERRUPT) - - // Restore m0 - s_mov_b32 m0, ttmp2 - - // Parking the wave requires saving the original pc in the preserved ttmps. - // Since all ttmps are used, we must first free ttmp6 by compressing the - // 40bit dispatch ptr in ttmp6:7 into a 25bit queue packet id. 
- // - // Register layout before parking the wave: - // - // ttmp6: dispatch_ptr[31:6] 0[5:0] - // ttmp7: 0[0] wave_stopped[0] status_halt[0] trap_id[3:0] 0[16:0] dispatch_ptr[39:32] - // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] - // - // After parking the wave: - // - // ttmp6: pc_lo[31:0] - // ttmp7: 1[0] wave_stopped[0] status_halt[0] trap_id[3:0] packet_id[24:0] - // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] - // - // The conversion from dispatch ptr to queue packet index only needs to be - // done once, the first time the wave executes the trap handler. - - .if ((.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) || .amdgcn.gfx_generation_number > 10) - s_branch .halt_wave - .else - s_bitcmp1_b32 ttmp7, TTMP7_DISPATCH_ID_CONVERTED_BIT - s_cbranch_scc1 .ttmp7_has_dispatch_index - - s_and_b32 ttmp3, ttmp3, MAX_NUM_DOORBELLS_MASK - s_lshl_b32 ttmp3, ttmp3, 0x3 - - // Map doorbell index to amd_queue_t* through TMA (doorbell_queue_map). - s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], ttmp3 glc - s_waitcnt lgkmcnt(0) - - // Retrieve queue base_address from hsa_queue_t*. - s_load_dword ttmp2, [ttmp2, ttmp3], 0x8 glc - s_waitcnt lgkmcnt(0) - - // The dispatch index is (dispatch_ptr.lo - base_address.lo) >> 6 - s_sub_u32 ttmp2, ttmp6, ttmp2 - s_lshr_b32 ttmp2, ttmp2, 0x6 - s_andn2_b32 ttmp7, ttmp7, TTMP7_PACKET_INDEX_MASK - s_or_b32 ttmp7, ttmp7, ttmp2 - s_bitset1_b32 ttmp7, TTMP7_DISPATCH_ID_CONVERTED_BIT - - .ttmp7_has_dispatch_index: - // Save the PC - s_mov_b32 ttmp6, ttmp0 - s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP11_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP11_PC_HI_SHIFT) - s_or_b32 ttmp11, ttmp11, ttmp1 - - // Park the wave - s_getpc_b64 [ttmp0, ttmp1] - s_add_u32 ttmp0, ttmp0, .parked - . 
- s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .halt_wave - - .parked: - s_trap 0x2 - s_branch .parked - .endif - - .excp_raised: - // If memory violation without XNACK error then signal queue error. - // XNACK error will be handled by VM interrupt, since it has more information. - s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK) - s_cmp_eq_u32 ttmp3, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_mov_b32 ttmp3, SIGNAL_CODE_MEM_VIOL - s_cbranch_scc1 .signal_error - - // If illegal instruction then signal queue error. - s_and_b32 ttmp3, ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK - s_mov_b32 ttmp3, SIGNAL_CODE_ILLEGAL_INST - s_cbranch_scc1 .signal_error - - // Otherwise (memory violation with XNACK error) return to shader. Do not - // send a signal as that will cause an interrupt storm. Instead let the - // interrupt generated by the TLB miss cause the kernel to notify ROCr and - // put the queue into an error state. This also ensures the TLB interrupt - // is received which provides information about the page causing the fault. - s_branch .halt_wave - - .trap_raised: - // Save the entry trap id in ttmp7.saved_trap_id - s_min_u32 ttmp3, ttmp2, 0xF - s_lshl_b32 ttmp3, ttmp3, TTMP7_SAVED_TRAP_ID_SHIFT - s_or_b32 ttmp7, ttmp7, ttmp3 - - // If debugger trap (s_trap >= 3) then signal debugger. - s_cmp_ge_u32 ttmp2, 0x3; - s_cbranch_scc1 .signal_debugger - - // If llvm.trap (s_trap 2) then signal queue error. - s_cmp_eq_u32 ttmp2, 0x2 - s_mov_b32 ttmp3, SIGNAL_CODE_LLVM_TRAP - s_cbranch_scc1 .signal_error - - // For other traps advance PC and return to shader. - s_add_u32 ttmp0, ttmp0, 0x4 - s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .exit_trap - - .signal_error: - .if (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) - // This needs to be rewritten for gfx10.3 as scalar stores are not available. - .else - // FIXME: don't trash ttmp4/ttmp5 when exception handling is unified. 
- s_mov_b32 ttmp4, ttmp3 - - // Fetch doorbell index for our queue. - s_mov_b32 ttmp2, exec_lo - s_mov_b32 ttmp3, exec_hi - mGetDoorbellId - s_mov_b32 exec_hi, ttmp3 - - // Restore exec_lo, move the doorbell index into ttmp3 - s_and_b32 exec_lo, exec_lo, MAX_NUM_DOORBELLS_MASK - s_lshl_b32 ttmp3, exec_lo, 0x3 - s_mov_b32 exec_lo, ttmp2 - - // Map doorbell index to amd_queue_t* through TMA (doorbell_queue_map). - s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], ttmp3 glc - s_waitcnt lgkmcnt(0) - - // Retrieve queue_inactive_signal from amd_queue_t*. - s_load_dwordx2 [ttmp2, ttmp3], [ttmp2, ttmp3], 0xC0 glc - s_waitcnt lgkmcnt(0) - - // Set queue signal value to error code. - s_mov_b32 ttmp5, 0x0 - s_atomic_swap_x2 [ttmp4, ttmp5], [ttmp2, ttmp3], 0x8 glc - s_waitcnt lgkmcnt(0) - - // Skip event trigger if the signal value was already non-zero. - s_or_b32 ttmp4, ttmp4, ttmp5 - s_cbranch_scc1 .skip_event_trigger - - // Check for a non-NULL signal event mailbox. - s_load_dwordx2 [ttmp4, ttmp5], [ttmp2, ttmp3], 0x10 glc - s_waitcnt lgkmcnt(0) - s_and_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], [ttmp4, ttmp5] - s_cbranch_scc0 .skip_event_trigger - - // Load the signal event value. - s_load_dword ttmp2, [ttmp2, ttmp3], 0x18 glc - s_waitcnt lgkmcnt(0) - - // Write the signal event value to the mailbox. - s_store_dword ttmp2, [ttmp4, ttmp5], 0x0 glc - s_waitcnt lgkmcnt(0) - - // Send an interrupt to trigger event notification. - s_mov_b32 m0, 0x0 - s_nop 0 - s_sendmsg sendmsg(MSG_INTERRUPT) - .endif - - .skip_event_trigger: - // Since we trashed ttmp4/ttmp5, reset the wave_id to 0 - s_mov_b32 ttmp4, 0x0 - s_mov_b32 ttmp5, 0x0 - - .halt_wave: - s_bitset1_b32 ttmp7, TTMP7_WAVE_STOPPED_BIT - - // Halt the wavefront. 
- s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_BIT - - .exit_trap: - mExitTrap -*/ - 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, - 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, - 0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, - 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, - 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, - 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, - 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, - 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, - 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, - 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, - 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, - 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, - 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, - 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, - 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, - 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, - 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, - 0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, -}; - -static const unsigned int kCodeTrapHandler90a[] = { - 0x8973ff73, 0x3e000000, 0x92eeff78, 0x0001000d, 0x8e6e9d6e, 0x87736e73, - 0x92eeff6d, 0x00080010, 0xbf850041, 0xb8eef803, 0x866fff6e, 0x00000900, - 
0xbf850031, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x866fff7e, 0x00000fff, - 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, - 0xbefc006e, 0xbf0d9f73, 0xbf85000f, 0x866fff6f, 0x000003ff, 0x8e6f836f, - 0xc0051bbd, 0x0000006f, 0xbf8cc07f, 0xc0031bb7, 0x00000008, 0xbf8cc07f, - 0x80ee6e72, 0x8f6e866e, 0x8973ff73, 0x01ffffff, 0x87736e73, 0xbef31a9f, - 0xbef2006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, - 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, - 0xbf920002, 0xbf82fffe, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef00ff, 0x20000000, 0xbf850011, 0x866fff6e, 0x00000800, 0xbeef00f4, - 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8e6f996f, 0x87736f73, 0xbf09836e, - 0xbf85ffbe, 0xbf06826e, 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf82002c, 0xbef0006f, 0xbeee007e, 0xbeef007f, 0xbefe00ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, - 0x867eff7e, 0x000003ff, 0x8e6f837e, 0xbefe006e, 0xc0051bbd, 0x0000006f, - 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, - 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, - 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, - 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, - 0xbef00080, 0xbef10080, 0xbef31a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, -}; - -static const unsigned int kCodeCopyAligned8[] = { - 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, - 0xC00A0400, 0x00000030, 0xC00A0500, 0x00000040, 0xC0020600, 0x00000050, - 0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205, 0xD1196A02, 0x00000900, - 0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04, 0x00000D00, 0xD11C6A05, - 0x01A90105, 0xD0E9006A, 0x00001102, 0xBF86000F, 0x86FE6A7E, 0xDC400000, - 0x01000002, 
0xBF8C0F70, 0xD1196A02, 0x00003102, 0xD11C6A03, 0x01A90103, - 0xDC600000, 0x00000104, 0xD1196A04, 0x00003104, 0xD11C6A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE01C1, 0x8E198418, 0x24020084, 0x7E060209, 0xD1196A02, - 0x00001101, 0xD11C6A03, 0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001501, - 0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001902, 0xBF86000E, 0xDC5C0000, - 0x08000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, - 0xDC7C0000, 0x00000804, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105, - 0xBF82FFEF, 0x8E198218, 0x24020082, 0x7E06020D, 0xD1196A02, 0x00001901, - 0xD11C6A03, 0x01A90103, 0x7E0A020F, 0xD1196A04, 0x00001D01, 0xD11C6A05, - 0x01A90105, 0xD0E9006A, 0x00002102, 0xBF86000F, 0x86FE6A7E, 0xDC500000, - 0x01000002, 0xD1196A02, 0x00003302, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, - 0xDC700000, 0x00000104, 0xD1196A04, 0x00003304, 0xD11C6A05, 0x01A90105, - 0xBF82FFEE, 0xBEFE01C1, 0x7E060211, 0xD1196A02, 0x00002100, 0xD11C6A03, - 0x01A90103, 0x7E0A0213, 0xD1196A04, 0x00002500, 0xD11C6A05, 0x01A90105, - 0xD0E9006A, 0x00002902, 0xBF860006, 0x86FE6A7E, 0xDC400000, 0x01000002, - 0xBF8C0F70, 0xDC600000, 0x00000104, 0xBF810000, -}; - -static const unsigned int kCodeCopyMisaligned8[] = { - 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xC00A0300, 0x00000020, - 0xC0020400, 0x00000030, 0xBF8C007F, 0x8E028602, 0x32000002, 0x7E060205, - 0xD1196A02, 0x00000900, 0xD11C6A03, 0x01A90103, 0x7E0A0207, 0xD1196A04, - 0x00000D00, 0xD11C6A05, 0x01A90105, 0xD0E9006A, 0x00001102, 0xBF860032, - 0xDC400000, 0x06000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xDC400000, 0x07000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xDC400000, 0x08000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xDC400000, 0x09000002, 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, - 0xBF8C0F70, 0xDC600000, 0x00000604, 0xD1196A04, 0x00002104, 0xD11C6A05, - 0x01A90105, 0xDC600000, 0x00000704, 0xD1196A04, 0x00002104, 0xD11C6A05, - 0x01A90105, 0xDC600000, 0x00000804, 0xD1196A04, 
0x00002104, 0xD11C6A05, - 0x01A90105, 0xDC600000, 0x00000904, 0xD1196A04, 0x00002104, 0xD11C6A05, - 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD1196A02, 0x00001100, 0xD11C6A03, - 0x01A90103, 0x7E0A020B, 0xD1196A04, 0x00001500, 0xD11C6A05, 0x01A90105, - 0xD0E9006A, 0x00001902, 0xBF86000F, 0x86FE6A7E, 0xDC400000, 0x01000002, - 0xD1196A02, 0x00002102, 0xD11C6A03, 0x01A90103, 0xBF8C0F70, 0xDC600000, - 0x00000104, 0xD1196A04, 0x00002104, 0xD11C6A05, 0x01A90105, 0xBF82FFEE, - 0xBF810000, -}; - -static const unsigned int kCodeFill8[] = { - 0xC00A0100, 0x00000000, 0xC00A0200, 0x00000010, 0xBF8C007F, 0x8E028602, - 0x32000002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, 0x8E0C840B, - 0x24020084, 0x7E060205, 0xD1196A02, 0x00000901, 0xD11C6A03, 0x01A90103, - 0xD0E9006A, 0x00000D02, 0xBF860007, 0xDC7C0000, 0x00000402, 0xD1196A02, - 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF6, 0x8E0C820B, 0x24020082, - 0x7E060207, 0xD1196A02, 0x00000D01, 0xD11C6A03, 0x01A90103, 0xD0E9006A, - 0x00001102, 0xBF860008, 0x86FE6A7E, 0xDC700000, 0x00000402, 0xD1196A02, - 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, -}; - -static const unsigned int kCodeCopyAligned10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, - 0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050, - 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02, - 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006, - 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, - 0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03, - 0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05, - 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, - 0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, - 0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E, - 0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, - 0xBF8C3F70, 
0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05, - 0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02, - 0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E, - 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, - 0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, - 0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05, - 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010, - 0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05, - 0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, - 0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000, -}; - -static const unsigned int kCodeCopyMisaligned10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, - 0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, - 0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, - 0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, - 0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008, - 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05, - 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000, - 0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70, - 0xDC600000, 0x007D0104, 0xD70F6A04, 
0x00020810, 0xD5286A05, 0x01A90105, - 0xBF82FFEE, 0xBF810000, -}; - -static const unsigned int kCodeFill10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602, - 0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, - 0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03, - 0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402, - 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B, - 0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103, - 0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402, - 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, -}; - -static const unsigned int kCodeTrapHandler1010[] = { - 0x8a73ff73, 0x3e000000, 0x93eeff78, 0x0001000d, 0x8f6e9d6e, 0x88736e73, - 0x93eeff6d, 0x00080010, 0xbf850041, 0xb96ef803, 0x876fff6e, 0x00000900, - 0xbf850031, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x876fff7e, 0x00000fff, - 0xbefe036e, 0xbeef1d96, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, - 0xbefc036e, 0xbf0d9f73, 0xbf85000f, 0x876fff6f, 0x000003ff, 0x8f6f836f, - 0xf4051bbd, 0xde000000, 0xbf8cc07f, 0xf4011bb7, 0xfa000008, 0xbf8cc07f, - 0x80ee6e72, 0x906e866e, 0x8a73ff73, 0x01ffffff, 0x88736e73, 0xbef31d9f, - 0xbef2036c, 0x876dff6d, 0x0000ffff, 0x8f6d876d, 0x8a77ff77, 0x007fff80, - 0x88776d77, 0xbeec1f00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820044, - 0xbf920002, 0xbf82fffe, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef03ff, 0x20000000, 0xbf850011, 0x876fff6e, 0x00000800, 0xbeef03f4, - 0xbf85000d, 0xbf820036, 0x83ef8f6e, 0x8f6f996f, 0x88736f73, 0xbf09836e, - 0xbf85ffbe, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf82002c, 0xbef0036f, 0xbeee037e, 0xbeef037f, 0xbefe03ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, - 0x877eff7e, 0x000003ff, 0x8f6f837e, 0xbefe036e, 
0xf4051bbd, 0xde000000, - 0xbf8cc07f, 0xf4051bb7, 0xfa0000c0, 0xbf8cc07f, 0xbef10380, 0xf6811c37, - 0xfa000008, 0xbf8cc07f, 0x88707170, 0xbf85000e, 0xf4051c37, 0xfa000010, - 0xbf8cc07f, 0x87f07070, 0xbf840009, 0xf4011bb7, 0xfa000018, 0xbf8cc07f, - 0xf4411bb8, 0xfa000000, 0xbf8cc07f, 0xbefc0380, 0xbf800000, 0xbf900001, - 0xbef00380, 0xbef10380, 0xbef31d9e, 0xbef81d8d, 0x906e8977, 0x876fff6e, - 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 0x886e6f6e, 0xb9eef807, - 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, -}; - -static const unsigned int kCodeTrapHandler10[] = { - 0x8a73ff73, 0x3e000000, 0x93eeff78, 0x0001000d, 0x8f6e9d6e, 0x88736e73, - 0x93eeff6d, 0x00080010, 0xbf850023, 0xb96ef803, 0x876fff6e, 0x00000900, - 0xbf850013, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, 0xbf90000a, - 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x876fff7e, 0x00000fff, - 0xbefe036e, 0xbeef1d96, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, - 0xbefc036e, 0xbf82001a, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, - 0xbeef03ff, 0x20000000, 0xbf850011, 0x876fff6e, 0x00000800, 0xbeef03f4, - 0xbf85000d, 0xbf82000e, 0x83ef8f6e, 0x8f6f996f, 0x88736f73, 0xbf09836e, - 0xbf85ffdc, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, - 0x826d806d, 0xbf820004, 0xbef00380, 0xbef10380, 0xbef31d9e, 0xbef81d8d, - 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, - 0x886e6f6e, 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, -}; - -/* -.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF -.set SQ_WAVE_PC_HI_HT_SHIFT , 24 -.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 -.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 -.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) -.set SQ_WAVE_STATUS_HALT_SHIFT , 13 -.set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16)) -.set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT , 8 -.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11 -.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28 -.set SQ_WAVE_TRAPSTS_MATH_EXCP 
, 0x7F -.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12 -.set TRAP_ID_ABORT , 2 -.set TRAP_ID_DEBUGTRAP , 3 -.set DOORBELL_ID_SIZE , 10 -.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) -.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) -.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) -.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) -.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) -.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) -.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) - -.set TTMP6_WAVE_STOPPED_SHIFT , 30 -.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 -.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) -.set TTMP6_SAVED_TRAP_ID_SHIFT , 25 -.set TTMP6_SAVED_TRAP_ID_SIZE , 4 -.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) -.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) -.set TTMP11_PC_HI_SHIFT , 7 -.set TTMP11_DEBUG_ENABLED_SHIFT , 23 - -.if .amdgcn.gfx_generation_number == 9 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 -.elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - .set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 - .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25 - .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 - .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000 - .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000 -.endif - -// ABI between first and second level trap handler: -// ttmp0 = PC[31:0] -// ttmp12 = SQ_WAVE_STATUS -// ttmp14 = TMA[31:0] -// ttmp15 = TMA[63:32] -// gfx9: -// ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] -// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] -// gfx10: -// ttmp1 
= 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] -// gfx1010: -// ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] -// gfx1030: -// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] - -trap_entry: - // Branch if not a trap (an exception instead). - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc0 .no_skip_debugtrap - - // If caused by s_trap then advance PC. - s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT - s_cbranch_scc1 .not_s_trap - s_add_u32 ttmp0, ttmp0, 0x4 - s_addc_u32 ttmp1, ttmp1, 0x0 - -.not_s_trap: - // If llvm.debugtrap and debugger is not attached. - s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP - s_cbranch_scc0 .no_skip_debugtrap - s_bitcmp0_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT - s_cbranch_scc0 .no_skip_debugtrap - - // Ignore llvm.debugtrap. - s_branch .exit_trap - -.no_skip_debugtrap: - // Save trap id and halt status in ttmp6. - s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) - s_min_u32 ttmp2, ttmp2, 0xF - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 - s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 - - // Fetch doorbell id for our queue. - s_mov_b32 ttmp2, exec_lo - s_mov_b32 ttmp3, exec_hi - s_mov_b32 exec_lo, 0x80000000 - s_sendmsg sendmsg(MSG_GET_DOORBELL) -.wait_sendmsg: - s_nop 0x7 - s_bitcmp0_b32 exec_lo, 0x1F - s_cbranch_scc0 .wait_sendmsg - s_mov_b32 exec_hi, ttmp3 - - // Restore exec_lo, move the doorbell_id into ttmp3 - s_and_b32 ttmp3, exec_lo, DOORBELL_ID_MASK - s_mov_b32 exec_lo, ttmp2 - - // Map trap reason to an exception code. 
- s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) - - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT - s_cbranch_scc0 .not_memory_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 - - // Aperture violation requires XNACK_ERROR == 0. - s_branch .not_aperture_violation - -.not_memory_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT - s_cbranch_scc0 .not_aperture_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 - -.not_aperture_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT - s_cbranch_scc0 .not_illegal_instruction - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 - -.not_illegal_instruction: - s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP - s_cbranch_scc0 .not_math_exception - s_getreg_b32 ttmp7, hwreg(HW_REG_MODE) - s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT - s_and_b32 ttmp2, ttmp2, ttmp7 - s_cbranch_scc0 .not_math_exception - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 - -.not_math_exception: - s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE - s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT - s_cbranch_scc0 .not_abort_trap - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 - -.not_abort_trap: - // If no other exception was flagged then report a generic error. - s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK - s_cbranch_scc1 .send_interrupt - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 - -.send_interrupt: - // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id - s_mov_b32 ttmp2, m0 - s_mov_b32 m0, ttmp3 - s_nop 0x0 // Manually inserted wait states - s_sendmsg sendmsg(MSG_INTERRUPT) - s_mov_b32 m0, ttmp2 - - // Parking the wave requires saving the original pc in the preserved ttmps. 
- // Register layout before parking the wave: - // - // ttmp7: 0[31:0] - // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] - // - // After parking the wave: - // - // ttmp7: pc_lo[31:0] - // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] - -.if ((.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) || .amdgcn.gfx_generation_number > 10) - s_branch .halt_wave -.else - // Save the PC - s_mov_b32 ttmp7, ttmp0 - s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP11_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP11_PC_HI_SHIFT) - s_or_b32 ttmp11, ttmp11, ttmp1 - - // Park the wave - s_getpc_b64 [ttmp0, ttmp1] - s_add_u32 ttmp0, ttmp0, .parked - . - s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .halt_wave - -.parked: - s_trap 0x2 - s_branch .parked -.endif - -.halt_wave: - // Halt the wavefront upon restoring STATUS below. - s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT - s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT - -.exit_trap: - // Restore SQ_WAVE_IB_STS. -.if .amdgcn.gfx_generation_number == 9 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 -.endif -.if .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK - s_or_b32 ttmp2, ttmp2, ttmp3 - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 -.endif - - // Restore SQ_WAVE_STATUS. 
- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 - - // Return to original (possibly modified) PC. - s_rfe_b64 [ttmp0, ttmp1] -*/ - -static const unsigned int kCodeTrapHandlerV2_9[] = { - 0x92eeff6d, 0x00080010, 0xbf840009, 0xbf0d986d, 0xbf850002, 0x806c846c, - 0x826d806d, 0xbf06836e, 0xbf840003, 0xbf0c9777, 0xbf840001, 0xbf82004c, - 0x8972ff72, 0x3e000000, 0x83ee8f6e, 0x8e6e996e, 0x87726e72, 0x92eeff78, - 0x0001000d, 0x8e6e9d6e, 0x87726e72, 0xbeee007e, 0xbeef007f, 0xbefe00ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, - 0x866fff7e, 0x000003ff, 0xbefe006e, 0xb8eef803, 0xbf0d9c6e, 0xbf840003, - 0x876fff6f, 0x00004000, 0xbf820004, 0xbf0d886e, 0xbf840002, 0x876fff6f, - 0x00008000, 0xbf0d8b6e, 0xbf840002, 0x876fff6f, 0x00002000, 0x866eff6e, - 0x0000007f, 0xbf840006, 0xb8f3f801, 0x8e6e8c6e, 0x866e736e, 0xbf840002, - 0x876fff6f, 0x00001000, 0x92eeff72, 0x00040019, 0xbf06826e, 0xbf840002, - 0x876fff6f, 0x00000400, 0x896eff6f, 0x000003ff, 0xbf850002, 0x876fff6f, - 0x00000800, 0xbeee007c, 0xbefc006f, 0xbf800000, 0xbf900001, 0xbefc006e, - 0xbef3006c, 0x866dff6d, 0x0000ffff, 0x8e6d876d, 0x8977ff77, 0x007fff80, - 0x87776d77, 0xbeec1c00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820002, - 0xbf920002, 0xbf82fffe, 0xbef21a9e, 0xbef81a8d, 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, -}; - -static const unsigned int kCodeTrapHandlerV2_1010[] = { - 0x93eeff6d, 0x00080010, 0xbf840009, 0xbf0d986d, 0xbf850002, 0x806c846c, - 0x826d806d, 0xbf06836e, 0xbf840003, 0xbf0c9777, 0xbf840001, 0xbf82004c, - 0x8a72ff72, 0x3e000000, 0x83ee8f6e, 0x8f6e996e, 0x88726e72, 0x93eeff78, - 0x0001000d, 0x8f6e9d6e, 0x88726e72, 0xbeee037e, 0xbeef037f, 0xbefe03ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, - 0x876fff7e, 0x000003ff, 0xbefe036e, 
0xb96ef803, 0xbf0d9c6e, 0xbf840003, - 0x886fff6f, 0x00004000, 0xbf820004, 0xbf0d886e, 0xbf840002, 0x886fff6f, - 0x00008000, 0xbf0d8b6e, 0xbf840002, 0x886fff6f, 0x00002000, 0x876eff6e, - 0x0000007f, 0xbf840006, 0xb973f801, 0x8f6e8c6e, 0x876e736e, 0xbf840002, - 0x886fff6f, 0x00001000, 0x93eeff72, 0x00040019, 0xbf06826e, 0xbf840002, - 0x886fff6f, 0x00000400, 0x8a6eff6f, 0x000003ff, 0xbf850002, 0x886fff6f, - 0x00000800, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, 0xbefc036e, - 0xbef3036c, 0x876dff6d, 0x0000ffff, 0x8f6d876d, 0x8a77ff77, 0x007fff80, - 0x88776d77, 0xbeec1f00, 0x806cff6c, 0x00000010, 0x826d806d, 0xbf820002, - 0xbf920002, 0xbf82fffe, 0xbef21d9e, 0xbef81d8d, 0x906e8977, 0x876fff6e, - 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 0x886e6f6e, 0xb9eef807, - 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, -}; - -static const unsigned int kCodeTrapHandlerV2_10[] = { - 0x93eeff6d, 0x00080010, 0xbf840009, 0xbf0d986d, 0xbf850002, 0x806c846c, - 0x826d806d, 0xbf06836e, 0xbf840003, 0xbf0c9777, 0xbf840001, 0xbf82003f, - 0x8a72ff72, 0x3e000000, 0x83ee8f6e, 0x8f6e996e, 0x88726e72, 0x93eeff78, - 0x0001000d, 0x8f6e9d6e, 0x88726e72, 0xbeee037e, 0xbeef037f, 0xbefe03ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, - 0x876fff7e, 0x000003ff, 0xbefe036e, 0xb96ef803, 0xbf0d9c6e, 0xbf840003, - 0x886fff6f, 0x00004000, 0xbf820004, 0xbf0d886e, 0xbf840002, 0x886fff6f, - 0x00008000, 0xbf0d8b6e, 0xbf840002, 0x886fff6f, 0x00002000, 0x876eff6e, - 0x0000007f, 0xbf840006, 0xb973f801, 0x8f6e8c6e, 0x876e736e, 0xbf840002, - 0x886fff6f, 0x00001000, 0x93eeff72, 0x00040019, 0xbf06826e, 0xbf840002, - 0x886fff6f, 0x00000400, 0x8a6eff6f, 0x000003ff, 0xbf850002, 0x886fff6f, - 0x00000800, 0xbeee037c, 0xbefc036f, 0xbf800000, 0xbf900001, 0xbefc036e, - 0xbf820000, 0xbef21d9e, 0xbef81d8d, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, - 0xbe80226c, -}; - - -} // namespace amd -} // namespace rocr - -#endif // header guard diff --git a/src/core/inc/amd_hsa_code.hpp 
b/src/core/inc/amd_hsa_code.hpp index 724100826..08a898cc6 100644 --- a/src/core/inc/amd_hsa_code.hpp +++ b/src/core/inc/amd_hsa_code.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -270,7 +270,7 @@ namespace code { void AddNoteProducerOptions(int32_t call_convention, const hsa_ext_control_directives_t& user_directives, const std::string& user_options); bool GetNoteProducerOptions(std::string& options); - bool GetIsa(std::string& isaName); + bool GetIsa(std::string& isaName, unsigned *genericVersion = nullptr); bool GetCodeObjectVersion(uint32_t* major, uint32_t* minor); hsa_status_t GetInfo(hsa_code_object_info_t attribute, void *value); hsa_status_t GetSymbol(const char *module_name, const char *symbol_name, hsa_code_symbol_t *sym); diff --git a/src/core/inc/amd_hsa_loader.hpp b/src/core/inc/amd_hsa_loader.hpp index f30047d0b..c63b7a961 100644 --- a/src/core/inc/amd_hsa_loader.hpp +++ b/src/core/inc/amd_hsa_loader.hpp @@ -50,6 +50,7 @@ #include "inc/hsa_ven_amd_loader.h" #include "inc/amd_hsa_elf.h" #include +#include #include #include @@ -162,8 +163,12 @@ class Context { virtual hsa_isa_t IsaFromName(const char *name) = 0; + // This function will be deleted in a future patch. Use the overload + // that takes a generic version instead. virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) = 0; + virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa, unsigned genericVersion) { return IsaSupportedByAgent(agent, isa); } + virtual void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) = 0; virtual bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) = 0; @@ -453,6 +458,13 @@ class Loader { const char *options, hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) = 0; + /// @brief Creates empty AMD HSA Executable with specified @p profile, + /// @p options and @p isolated_context that is isolated from the runtime. 
+ virtual Executable* CreateExecutable( + std::unique_ptr isolated_context, + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) = 0; /// @brief Freezes @p executable virtual hsa_status_t FreezeExecutable(Executable *executable, const char *options) = 0; diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h index da33a655c..adc2d1645 100644 --- a/src/core/inc/amd_memory_region.h +++ b/src/core/inc/amd_memory_region.h @@ -96,7 +96,7 @@ class MemoryRegion : public core::MemoryRegion { static void MakeKfdMemoryUnresident(const void* ptr); MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain, - core::Agent* owner, const HsaMemoryProperties& mem_props); + bool user_visible, core::Agent* owner, const HsaMemoryProperties& mem_props); ~MemoryRegion(); @@ -193,7 +193,7 @@ class MemoryRegion : public core::MemoryRegion { // fragments of the block routing to the same MemoryRegion. 
mutable KernelMutex access_lock_; - static const size_t kPageSize_ = 4096; + static size_t kPageSize_; // Determine access type allowed to requesting device hsa_amd_memory_pool_access_t GetAccessInfo(const core::Agent& agent, diff --git a/src/core/inc/checked.h b/src/core/inc/checked.h index 93793bcc7..56497d120 100644 --- a/src/core/inc/checked.h +++ b/src/core/inc/checked.h @@ -58,7 +58,7 @@ template class Check final { Check(const Check&) { object_ = uintptr_t(this) ^ uintptr_t(code); } Check(Check&&) { object_ = uintptr_t(this) ^ uintptr_t(code); } - ~Check() { object_ = NULL; } + ~Check() { object_ = uintptr_t(NULL); } const Check& operator=(Check&& rhs) { return *this; } const Check& operator=(const Check& rhs) { return *this; } diff --git a/src/core/inc/driver.h b/src/core/inc/driver.h new file mode 100644 index 000000000..c6b7ffac1 --- /dev/null +++ b/src/core/inc/driver.h @@ -0,0 +1,109 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTME_CORE_INC_DRIVER_H_ +#define HSA_RUNTME_CORE_INC_DRIVER_H_ + +#include + +#include "core/inc/agent.h" +#include "core/inc/memory_region.h" +#include "inc/hsa.h" + +namespace rocr { +namespace core { + +using MemFlags = uint32_t; + +struct MemProperties { + MemFlags flags_; + size_t size_bytes_; + uint64_t virtual_base_addr_; +}; + +/// @brief Kernel driver interface. +/// +/// @details A class used to provide an interface between the core runtime +/// and agent kernel drivers. It also maintains state associated with active +/// kernel drivers. +class Driver { + public: + Driver() = delete; + Driver(const std::string devnode_name, Agent::DeviceType agent_device_type); + virtual ~Driver() {} + + /// @brief Open a connection to the driver using name_. + /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully. + hsa_status_t Open(); + /// @brief Close a connection to the open driver using fd_. 
+ /// @retval HSA_STATUS_SUCCESS if the driver was opened successfully. + hsa_status_t Close(); + + virtual hsa_status_t GetMemoryProperties(uint32_t node_id, MemProperties &mprops) const = 0; + + /// @brief Allocate agent-accessible memory (system or agent-local memory). + /// + /// @param[out] pointer to newly allocated memory. + /// + /// @retval HSA_STATUS_SUCCESS if memory was successfully allocated or + /// hsa_status_t error code if the memory allocation failed. + virtual hsa_status_t AllocateMemory(void** mem, size_t size, uint32_t node_id, + MemFlags flags) = 0; + + virtual hsa_status_t FreeMemory(void* mem, uint32_t node_id) = 0; + + virtual hsa_status_t CreateQueue(Queue &queue) = 0; + + virtual hsa_status_t DestroyQueue(Queue &queue) const = 0; + + /// Specify the agent device type this driver is for. + const Agent::DeviceType agent_device_type_; + + protected: + const std::string devnode_name_; + int fd_ = -1; +}; + +} // namespace core +} // namespace rocr + +#endif // header guard diff --git a/src/core/inc/host_queue.h b/src/core/inc/host_queue.h index 8521aed7b..ce0bfbbcc 100644 --- a/src/core/inc/host_queue.h +++ b/src/core/inc/host_queue.h @@ -152,10 +152,18 @@ class HostQueue : public Queue { return HSA_STATUS_ERROR_INVALID_QUEUE; } - void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override { + void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) override { assert(false && "HostQueue::ExecutePM4 is unimplemented"); } + hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) override { + assert(false && "HostQueue::GetInfo is unimplemented"); + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + void* operator new(size_t size) { return _aligned_malloc(size, HSA_QUEUE_ALIGN_BYTES); } diff --git a/src/core/inc/hsa_api_trace_int.h b/src/core/inc/hsa_api_trace_int.h index 
f270efb72..e61e8a5db 100644 --- a/src/core/inc/hsa_api_trace_int.h +++ b/src/core/inc/hsa_api_trace_int.h @@ -53,6 +53,7 @@ namespace core { static const uint32_t HSA_EXT_FINALIZER_API_TABLE_ID = 0; static const uint32_t HSA_EXT_IMAGE_API_TABLE_ID = 1; static const uint32_t HSA_EXT_AQLPROFILE_API_TABLE_ID = 2; + static const uint32_t HSA_EXT_PC_SAMPLING_API_TABLE_ID = 3; ::HsaApiTable hsa_api; ::CoreApiTable core_api; @@ -60,6 +61,7 @@ namespace core { ::FinalizerExtTable finalizer_api; ::ImageExtTable image_api; ::ToolsApiTable tools_api; + ::PcSamplingExtTable pcs_api; HsaApiTable(); void Init(); diff --git a/src/core/inc/hsa_ext_amd_impl.h b/src/core/inc/hsa_ext_amd_impl.h index 19357d2d8..e5717b3fa 100644 --- a/src/core/inc/hsa_ext_amd_impl.h +++ b/src/core/inc/hsa_ext_amd_impl.h @@ -302,6 +302,10 @@ hsa_status_t HSA_API hsa_amd_portable_close_dmabuf(int dmabuf); hsa_status_t hsa_amd_vmem_address_reserve(void** ptr, size_t size, uint64_t address, uint64_t flags); +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_vmem_address_reserve_align(void** ptr, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + // Mirrors Amd Extension Apis hsa_status_t hsa_amd_vmem_address_free(void* ptr, size_t size); @@ -349,6 +353,10 @@ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle(hsa_amd_vmem_alloc_ha // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); +// Mirrors Amd Extension Apis +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + } // namespace amd } // namespace rocr diff --git a/src/core/inc/hsa_ext_interface.h b/src/core/inc/hsa_ext_interface.h index 20a51759f..c6b275b1e 100644 --- a/src/core/inc/hsa_ext_interface.h +++ b/src/core/inc/hsa_ext_interface.h @@ -57,12 +57,17 @@ struct ImageExtTableInternal : public ImageExtTable { decltype(::hsa_amd_image_get_info_max_dim)* hsa_amd_image_get_info_max_dim_fn; 
}; +struct PcSamplingExtTableInternal : public PcSamplingExtTable {}; + class ExtensionEntryPoints { public: // Table of function pointers for Hsa Extension Image ImageExtTableInternal image_api; + // Table of function pointers for Hsa vendor PC Sampling + PcSamplingExtTableInternal pcs_api; + // Table of function pointers for Hsa Extension Finalizer FinalizerExtTable finalizer_api; @@ -77,6 +82,12 @@ class ExtensionEntryPoints { // Reset Api tables to point to null implementations void UnloadImage(); + // Update PC Sampling Api table with handles to implementation + void LoadPcSampling(); + + // Reset PC Sampling tables to point to null implementations + void UnloadPcSampling(); + private: typedef void (*Load_t)(const ::HsaApiTable* table); typedef void (*Unload_t)(); @@ -89,6 +100,9 @@ class ExtensionEntryPoints { // Initialize table for HSA Image Extension Api's void InitImageExtTable(); + // Initialize table for HSA PC Sampling Extension Api's + void InitPcSamplingExtTable(); + // Initialize Amd Ext table for Api related to Images void InitAmdExtTable(); @@ -96,7 +110,7 @@ class ExtensionEntryPoints { void UpdateAmdExtTable(decltype(::hsa_amd_image_create)* func_ptr); DISALLOW_COPY_AND_ASSIGN(ExtensionEntryPoints); -}; +}; } // namespace core } // namespace rocr diff --git a/src/core/inc/intercept_queue.h b/src/core/inc/intercept_queue.h index e7784ffaa..8088d5e92 100644 --- a/src/core/inc/intercept_queue.h +++ b/src/core/inc/intercept_queue.h @@ -120,8 +120,11 @@ class QueueWrapper : public Queue { hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) override { return wrapped->GetCUMasking(num_cu_mask_count, cu_mask); } - void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override { - wrapped->ExecutePM4(cmd_data, cmd_size_b); + void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) override { 
+ wrapped->ExecutePM4(cmd_data, cmd_size_b, acquireFence, releaseFence, signal); } void SetProfiling(bool enabled) override { wrapped->SetProfiling(enabled); } @@ -266,6 +269,9 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSi StoreRelaxed(value); } + /// @brief Provide information about the queue + hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) override; + static __forceinline bool IsType(core::Signal* signal) { return signal->IsType(&rtti_id_); } static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id_); } diff --git a/src/core/inc/memory_region.h b/src/core/inc/memory_region.h index f36b195e7..66acf3636 100644 --- a/src/core/inc/memory_region.h +++ b/src/core/inc/memory_region.h @@ -58,11 +58,12 @@ class Agent; class MemoryRegion : public Checked<0x9C961F19EE175BB3> { public: MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain, - core::Agent* owner) + bool user_visible, core::Agent* owner) : fine_grain_(fine_grain), kernarg_(kernarg), full_profile_(full_profile), extended_scope_fine_grain_(extended_scope_fine_grain), + user_visible_(user_visible), owner_(owner) { assert(owner_ != NULL); } @@ -103,6 +104,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { // Note: The node_id needs to be the node_id of the device even though this is allocating // system memory AllocateGTTAccess = (1 << 9), + AllocateContiguous = (1 << 10), // Physically contiguous memory }; typedef uint32_t AllocateFlags; @@ -132,6 +134,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { __forceinline bool full_profile() const { return full_profile_; } + __forceinline bool user_visible() const { return user_visible_; } + __forceinline core::Agent* owner() const { return owner_; } private: @@ -139,6 +143,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { const bool kernarg_; const bool full_profile_; const bool 
extended_scope_fine_grain_; + const bool user_visible_; + core::Agent* owner_; }; } // namespace core diff --git a/src/core/inc/queue.h b/src/core/inc/queue.h index 112510234..75a291a66 100644 --- a/src/core/inc/queue.h +++ b/src/core/inc/queue.h @@ -52,7 +52,7 @@ #include "core/inc/memory_region.h" #include "core/util/utils.h" #include "inc/amd_hsa_queue.h" - +#include "inc/hsa_ext_amd.h" #include "hsakmt/hsakmt.h" namespace rocr { @@ -346,14 +346,33 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { /// @return hsa_status_t virtual hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t* cu_mask) = 0; - // @brief Submits a block of PM4 and waits until it has been executed. - virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) = 0; + /// @brief Submits a block of PM4. + /// + /// @param cmd_data pointer to command buffer + /// + /// @param cmd_size_b command buffer size in bytes + /// + /// @param acquireFence acquire-fence type + /// + /// @param releaseFence acquire-fence type + /// + /// @param signal optional wait signal + /// + /// if @p signal is provided, function will return without waiting for commands to be executed + /// if @p signal is NULL, waits until commands have been executed. + virtual void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, + hsa_fence_scope_t acquireFence = HSA_FENCE_SCOPE_NONE, + hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, + hsa_signal_t* signal = NULL) = 0; virtual void SetProfiling(bool enabled) { AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, (enabled != 0)); } + /// @ brief Returns queue queries about the queue + virtual hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void* value) = 0; + /// @ brief Reports async queue errors to stderr if no other error handler was registered. 
static void DefaultErrorHandler(hsa_status_t status, hsa_queue_t* source, void* data); diff --git a/src/core/inc/registers.h b/src/core/inc/registers.h index b8ba6aa5d..2dc30068d 100644 --- a/src/core/inc/registers.h +++ b/src/core/inc/registers.h @@ -134,6 +134,23 @@ SQ_SEL_W = 0x00000007, float f32All; }; + union COMPUTE_TMPRING_SIZE_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 18; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int WAVESIZE : 18; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + union SQ_BUF_RSRC_WORD0 { struct { @@ -301,6 +318,44 @@ SQ_SEL_W = 0x00000007, unsigned int DST_SEL_Z : 3; unsigned int DST_SEL_Y : 3; unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int COMPRESSION_EN : 1; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int COMPRESSION_EN : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; #endif } bitfields, bits; unsigned int u32All; diff --git a/src/core/inc/runtime.h 
b/src/core/inc/runtime.h index 6f5997f7e..3d4d889d8 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -359,7 +359,7 @@ class Runtime { hsa_status_t DmaBufClose(int dmabuf); - hsa_status_t VMemoryAddressReserve(void** ptr, size_t size, uint64_t address, uint64_t flags); + hsa_status_t VMemoryAddressReserve(void** ptr, size_t size, uint64_t address, uint64_t alignment, uint64_t flags); hsa_status_t VMemoryAddressFree(void* ptr, size_t size); diff --git a/src/core/inc/sdma_registers.h b/src/core/inc/sdma_registers.h index d94eed43f..7a26b7350 100644 --- a/src/core/inc/sdma_registers.h +++ b/src/core/inc/sdma_registers.h @@ -130,7 +130,7 @@ typedef struct SDMA_PKT_COPY_LINEAR_TAG { static const size_t kMaxSize_ = 0x3fffe0; } SDMA_PKT_COPY_LINEAR; -// linear sub-window +// linear sub-window (pre-GFX12) typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG { static const unsigned int pitch_bits = 19; static const unsigned int slice_bits = 28; @@ -253,6 +253,121 @@ typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG { } SDMA_PKT_COPY_LINEAR_RECT; +// linear sub-window (GFX12) +typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG_GFX12 { + static const unsigned int pitch_bits = 16; + static const unsigned int slice_bits = 32; + static const unsigned int rect_xy_bits = 16; + static const unsigned int rect_z_bits = 14; + + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved : 13; + unsigned int element : 3; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int src_addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } SRC_ADDR_LO_UNION; + + union { + struct { + unsigned int src_addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } SRC_ADDR_HI_UNION; + + union { + struct { + unsigned int src_offset_x : 16; + unsigned int src_offset_y : 16; + }; + unsigned int DW_3_DATA; + } SRC_PARAMETER_1_UNION; + + union { + struct { + unsigned int src_offset_z : 14; + unsigned int reserved_1 : 2; + unsigned int src_pitch : 
pitch_bits; + }; + unsigned int DW_4_DATA; + } SRC_PARAMETER_2_UNION; + + union { + struct { + unsigned int src_slice_pitch : slice_bits; + }; + unsigned int DW_5_DATA; + } SRC_PARAMETER_3_UNION; + + union { + struct { + unsigned int dst_addr_31_0 : 32; + }; + unsigned int DW_6_DATA; + } DST_ADDR_LO_UNION; + + union { + struct { + unsigned int dst_addr_63_32 : 32; + }; + unsigned int DW_7_DATA; + } DST_ADDR_HI_UNION; + + union { + struct { + unsigned int dst_offset_x : 16; + unsigned int dst_offset_y : 16; + }; + unsigned int DW_8_DATA; + } DST_PARAMETER_1_UNION; + + union { + struct { + unsigned int dst_offset_z : 14; + unsigned int reserved_1 : 2; + unsigned int dst_pitch : pitch_bits; + }; + unsigned int DW_9_DATA; + } DST_PARAMETER_2_UNION; + + union { + struct { + unsigned int dst_slice_pitch : slice_bits; + }; + unsigned int DW_10_DATA; + } DST_PARAMETER_3_UNION; + + union { + struct { + unsigned int rect_x : rect_xy_bits; + unsigned int rect_y : rect_xy_bits; + }; + unsigned int DW_11_DATA; + } RECT_PARAMETER_1_UNION; + + union { + struct { + unsigned int rect_z : rect_z_bits; + unsigned int reserved_1 : 6; + unsigned int dst_cache_policy : 3; + unsigned int reserved_2 : 5; + unsigned int src_cache_policy : 3; + unsigned int reserved_3 : 1; + }; + unsigned int DW_12_DATA; + } RECT_PARAMETER_2_UNION; + +} SDMA_PKT_COPY_LINEAR_RECT_GFX12; + typedef struct SDMA_PKT_CONSTANT_FILL_TAG { union { struct { diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h index a1096ddde..39e532186 100644 --- a/src/core/inc/signal.h +++ b/src/core/inc/signal.h @@ -407,7 +407,7 @@ class Signal { core::Agent* async_copy_agent_; private: - static HybridMutex ipcLock_; + static KernelMutex ipcLock_; static std::map ipcMap_; static Signal* lookupIpc(hsa_signal_t signal); diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index c968eec05..4720a6b32 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -68,6 
+68,7 @@ #include "core/inc/hsa_ext_amd_impl.h" #include "core/inc/amd_gpu_pm4.h" #include "core/inc/hsa_amd_tool_int.hpp" +#include "core/inc/amd_core_dump.hpp" namespace rocr { namespace AMD { @@ -218,6 +219,12 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr queue_scratch_.mem_alignment_size = 1024; queue_scratch_.use_once_limit = core::Runtime::runtime_singleton_->flag().scratch_single_limit(); + if (queue_scratch_.use_once_limit > agent_->MaxScratchDevice()) { + fprintf(stdout, "User specified scratch limit exceeds device limits (requested:%lu max:%lu)!\n", + queue_scratch_.use_once_limit, agent_->MaxScratchDevice()); + queue_scratch_.use_once_limit = agent_->MaxScratchDevice(); + } + queue_scratch_.use_alt_limit = 0; queue_scratch_.async_reclaim = agent_->AsyncScratchReclaimEnabled(); @@ -358,13 +365,17 @@ AqlQueue::~AqlQueue() { // Remove kfd exception handler exceptionState |= ERROR_HANDLER_TERMINATE; while ((exceptionState & ERROR_HANDLER_DONE) != ERROR_HANDLER_DONE) { + const uint64_t timeout_ms = 5000; + exception_signal_->StoreRelease(-1ull); - exception_signal_->WaitRelaxed(HSA_SIGNAL_CONDITION_NE, -1ull, -1ull, HSA_WAIT_STATE_BLOCKED); + exception_signal_->WaitRelaxed(HSA_SIGNAL_CONDITION_NE, -1ull, timeout_ms, + HSA_WAIT_STATE_BLOCKED); } Inactivate(); - agent_->ReleaseQueueMainScratch(queue_scratch_); - agent_->ReleaseQueueAltScratch(queue_scratch_); + + if (queue_scratch_.main_queue_base) agent_->ReleaseQueueMainScratch(queue_scratch_); + if (queue_scratch_.alt_queue_base) agent_->ReleaseQueueAltScratch(queue_scratch_); FreeRegisteredRingBuffer(); exception_signal_->DestroySignal(); @@ -527,6 +538,25 @@ void AqlQueue::StoreRelease(hsa_signal_value_t value) { StoreRelaxed(value); } +hsa_status_t AqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) { + switch (attribute) { + case HSA_AMD_QUEUE_INFO_AGENT: + *(reinterpret_cast(value)) = agent_->public_handle(); + break; + case 
HSA_AMD_QUEUE_INFO_DOORBELL_ID: + if (doorbell_type_ == 2) + // Hardware doorbell supports AQL semantics. + *(reinterpret_cast(value)) = + reinterpret_cast(signal_.hardware_doorbell_ptr); + else + return HSA_STATUS_ERROR_INVALID_QUEUE; + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; +} + uint32_t AqlQueue::ComputeRingBufferMinPkts() { // From CP_HQD_PQ_CONTROL.QUEUE_SIZE specification: // Size of the primary queue (PQ) will be: 2^(HQD_QUEUE_SIZE+1) DWs. @@ -1260,6 +1290,21 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) { return false; } + // Fallback if KFD does not support GPU core dump. In this case, there core dump is + // generated by hsa-runtime. + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump && + queue->agent_->isa()->GetMajorVersion() != 11) { + + if (pcs::PcsRuntime::instance()->SessionsActive()) + fprintf(stderr, "GPU core dump skipped because PC Sampling active\n"); + else if (amd::coredump::dump_gpu_core()) + fprintf(stderr, "GPU core dump failed\n"); + // supports_core_dump flag is overwritten to avoid generate core dump file again + // caught by a different exception handler. Such as VMFaultHandler. + core::Runtime::runtime_singleton_->KfdVersion( + core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true); + } + for (auto& error : QueueErrors) { if (error_code & (1 << (error.code - 1))) { errorCode = error.status; @@ -1375,7 +1420,12 @@ void AqlQueue::SetProfiling(bool enabled) { return; } -void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { +// If in_signal is NULL then this ExecutePM4 will block and wait for PM4 commands to complete +// If in_signal is provided, then ExecutePM4 will return and caller may wait for in_signal +// Note: On gfx8, there is no completion signal support, so ExecutePM4 will block even if +// in_signal is provided, and it is still valid to check in_signal after ExecutePM4 returns. 
+void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope_t acquireFence, + hsa_fence_scope_t releaseFence, hsa_signal_t* in_signal) { // pm4_ib_buf_ is a shared resource, so mutually exclude here. ScopedAcquire lock(&pm4_ib_mutex_); @@ -1411,7 +1461,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { // To respect multi-producer semantics, first buffer commands for the queue slot. constexpr uint32_t slot_size_dw = uint32_t(slot_size_b / sizeof(uint32_t)); uint32_t slot_data[slot_size_dw]; - hsa_signal_t signal = {0}; + hsa_signal_t local_signal = {0}; hsa_status_t err; if (agent_->isa()->GetMajorVersion() <= 8) { @@ -1456,28 +1506,32 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { // Construct an AQL packet to jump to the PM4 IB. struct amd_aql_pm4_ib { uint16_t header; - uint8_t amd_format; - uint8_t reserved0; + uint16_t ven_hdr; uint32_t ib_jump_cmd[4]; uint32_t dw_cnt_remain; - uint32_t reserved1[8]; + uint32_t reserved[8]; hsa_signal_t completion_signal; }; - constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1; + if (!in_signal) { + err = hsa_signal_create(1, 0, NULL, &local_signal); + assert(err == HSA_STATUS_SUCCESS); + } - err = hsa_signal_create(1, 0, NULL, &signal); - assert(err == HSA_STATUS_SUCCESS); + constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1; amd_aql_pm4_ib aql_pm4_ib{}; - aql_pm4_ib.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - aql_pm4_ib.amd_format = AMD_AQL_FORMAT_PM4_IB; + aql_pm4_ib.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE | + (acquireFence << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | + (releaseFence << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + + aql_pm4_ib.ven_hdr = AMD_AQL_FORMAT_PM4_IB; aql_pm4_ib.ib_jump_cmd[0] = ib_jump_cmd[0]; aql_pm4_ib.ib_jump_cmd[1] = ib_jump_cmd[1]; aql_pm4_ib.ib_jump_cmd[2] = ib_jump_cmd[2]; aql_pm4_ib.ib_jump_cmd[3] = ib_jump_cmd[3]; aql_pm4_ib.dw_cnt_remain = 0xA; - aql_pm4_ib.completion_signal = 
signal; + aql_pm4_ib.completion_signal = in_signal ? *in_signal : local_signal; memcpy(slot_data, &aql_pm4_ib, sizeof(aql_pm4_ib)); } else { @@ -1498,11 +1552,14 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) { if (agent_->isa()->GetMajorVersion() <= 8) { while (queue->LoadReadIndexRelaxed() <= write_idx) os::YieldThread(); - } else { + + if (in_signal) hsa_signal_store_screlease(*in_signal, 0); + } else if (!in_signal) { + // On gfx9 and newer, if in_signal is not provided, we block and wait for own signal hsa_signal_value_t ret; - ret = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, - (uint64_t)-1, HSA_WAIT_STATE_ACTIVE); - err = hsa_signal_destroy(signal); + ret = hsa_signal_wait_scacquire(local_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_ACTIVE); + err = hsa_signal_destroy(local_signal); assert(ret == 0 && err == HSA_STATUS_SUCCESS); } } @@ -1617,6 +1674,26 @@ void AqlQueue::FillBufRsrcWord3_Gfx11() { amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; } +void AqlQueue::FillBufRsrcWord3_Gfx12() { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.WRITE_COMPRESS_ENABLE = 0; + srd3.bits.COMPRESSION_EN = 0; + srd3.bits.COMPRESSION_ACCESS_MODE = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; +} + // Set concurrent wavefront limits only when scratch is being used. 
void AqlQueue::FillComputeTmpRingSize() { COMPUTE_TMPRING_SIZE tmpring_size = {}; @@ -1715,10 +1792,52 @@ void AqlQueue::FillComputeTmpRingSize_Gfx11() { amd_queue_.compute_tmpring_size = tmpring_size.u32All; } +// Set concurrent wavefront limits only when scratch is being used. +void AqlQueue::FillComputeTmpRingSize_Gfx12() { + // For GFX12, struct field size changes. + // Consider refactoring code for GFX11/GFX12 if no other changes. + COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {}; + if (queue_scratch_.main_size == 0) { + amd_queue_.compute_tmpring_size = tmpring_size.u32All; + return; + } + + const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + + // Determine the maximum number of waves device can support + uint32_t num_cus = agent_props.NumFComputeCores / (agent_props.NumSIMDPerCU * num_xcc); + uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU; + + // Scratch is allocated program COMPUTE_TMPRING_SIZE register + // Scratch Size per Wave is specified in terms of kilobytes + uint32_t wave_scratch = (((queue_scratch_.main_lanes_per_wave * queue_scratch_.main_size_per_thread) + + queue_scratch_.mem_alignment_size - 1) / + queue_scratch_.mem_alignment_size); + + tmpring_size.bits.WAVESIZE = wave_scratch; + assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow."); + + uint32_t num_waves = + queue_scratch_.main_size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size); + + // For GFX11 we specify number of waves per engine instead of total + num_waves /= agent_->properties().NumShaderBanks; + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves); + amd_queue_.compute_tmpring_size = tmpring_size.u32All; +} + // @brief Define the Scratch Buffer Descriptor and related parameters // that enable kernel access scratch memory void AqlQueue::InitScratchSRD() { switch (agent_->isa()->GetMajorVersion()) { + case 12: + FillBufRsrcWord0(); + FillBufRsrcWord1_Gfx11(); + 
FillBufRsrcWord2(); + FillBufRsrcWord3_Gfx12(); + FillComputeTmpRingSize_Gfx12(); + break; case 11: FillBufRsrcWord0(); FillBufRsrcWord1_Gfx11(); diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp index b39f86461..fe459e13d 100644 --- a/src/core/runtime/amd_blit_sdma.cpp +++ b/src/core/runtime/amd_blit_sdma.cpp @@ -122,6 +122,8 @@ BlitSdma::BlitSdma() cached_commit_index_(0), platform_atomic_support_(true), hdp_flush_support_(false), + gang_leader_(false), + is_ganged_(false), min_submission_size_(0) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } @@ -551,7 +553,13 @@ BlitSdma::SubmitCopyRe if (range->z > 1 && (src->slice == 0 || dst->slice == 0)) throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice needed."); - const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits; + // GFX12 or later use a different packet format that is incompatible (fields changed in size and location). + const bool isGFX12Plus = (agent_->isa()->GetMajorVersion() >= 12); + + // Common and GFX12 packet must match in size to use same code for vector/append. + static_assert(sizeof(SDMA_PKT_COPY_LINEAR_RECT) == sizeof(SDMA_PKT_COPY_LINEAR_RECT_GFX12)); + + const uint max_pitch = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::pitch_bits : SDMA_PKT_COPY_LINEAR_RECT::pitch_bits); std::vector pkts; std::vector bytes_moved; @@ -836,12 +844,15 @@ void BlitSdma::BuildCo return __builtin_ctz(width | 16); }; + // GFX12 or later use a different packet format that is incompatible (fields changed in size and location). 
+ const bool isGFX12Plus = (agent_->isa()->GetMajorVersion() >= 12); + // Limits in terms of element count - const uint32_t max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits; - const uint32_t max_slice = 1 << SDMA_PKT_COPY_LINEAR_RECT::slice_bits; - const uint32_t max_x = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits; - const uint32_t max_y = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits; - const uint32_t max_z = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits; + const uint32_t max_pitch = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::pitch_bits : SDMA_PKT_COPY_LINEAR_RECT::pitch_bits); + const uint32_t max_slice = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::slice_bits : SDMA_PKT_COPY_LINEAR_RECT::slice_bits); + const uint32_t max_x = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::rect_xy_bits : SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits); + const uint32_t max_y = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::rect_xy_bits : SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits); + const uint32_t max_z = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::rect_z_bits : SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits); // Find maximum element that describes the pitch and slice. // Pitch and slice must both be represented in units of elements. No element larger than this @@ -916,27 +927,52 @@ void BlitSdma::BuildCo x += xcount << element; - SDMA_PKT_COPY_LINEAR_RECT* pkt = + // GFX12 has a different packet format that is incompatible with pre-GFX12. 
+ if (isGFX12Plus) { + SDMA_PKT_COPY_LINEAR_RECT_GFX12* pkt = + (SDMA_PKT_COPY_LINEAR_RECT_GFX12*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT)); + *pkt = {}; + pkt->HEADER_UNION.op = SDMA_OP_COPY; + pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT; + pkt->HEADER_UNION.element = element; + pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase; + pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32; + pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff; + pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1; + pkt->SRC_PARAMETER_3_UNION.src_slice_pitch = + (range->z == 1) ? 0 : (src->slice >> element) - 1; + pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase; + pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32; + pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff; + pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1; + pkt->DST_PARAMETER_3_UNION.dst_slice_pitch = + (range->z == 1) ? 0 : (dst->slice >> element) - 1; + pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1; + pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1; + pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1; + } else { // Pre-GFX12, common packet used + SDMA_PKT_COPY_LINEAR_RECT* pkt = (SDMA_PKT_COPY_LINEAR_RECT*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT)); - *pkt = {}; - pkt->HEADER_UNION.op = SDMA_OP_COPY; - pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT; - pkt->HEADER_UNION.element = element; - pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase; - pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32; - pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff; - pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1; - pkt->SRC_PARAMETER_3_UNION.src_slice_pitch = + *pkt = {}; + pkt->HEADER_UNION.op = SDMA_OP_COPY; + pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT; + pkt->HEADER_UNION.element = element; + pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase; + pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32; + pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff; + 
pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1; + pkt->SRC_PARAMETER_3_UNION.src_slice_pitch = (range->z == 1) ? 0 : (src->slice >> element) - 1; - pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase; - pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32; - pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff; - pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1; - pkt->DST_PARAMETER_3_UNION.dst_slice_pitch = + pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase; + pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32; + pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff; + pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1; + pkt->DST_PARAMETER_3_UNION.dst_slice_pitch = (range->z == 1) ? 0 : (dst->slice >> element) - 1; - pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1; - pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1; - pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1; + pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1; + pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1; + pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1; + } } } } diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp index 7e68c7d23..df473d421 100644 --- a/src/core/runtime/amd_cpu_agent.cpp +++ b/src/core/runtime/amd_cpu_agent.cpp @@ -85,15 +85,15 @@ void CpuAgent::InitRegionList() { if (system_prop != mem_props.end()) system_props = *system_prop; MemoryRegion* system_region_fine = - new MemoryRegion(true, false, is_apu_node, false, this, system_props); + new MemoryRegion(true, false, is_apu_node, false, true, this, system_props); regions_.push_back(system_region_fine); MemoryRegion* system_region_kernarg = - new MemoryRegion(true, true, is_apu_node, false, this, system_props); + new MemoryRegion(true, true, is_apu_node, false, true, this, system_props); regions_.push_back(system_region_kernarg); if (!is_apu_node) { MemoryRegion* system_region_coarse = - new 
MemoryRegion(false, false, is_apu_node, false, this, system_props); + new MemoryRegion(false, false, is_apu_node, false, true, this, system_props); regions_.push_back(system_region_coarse); } } @@ -152,6 +152,7 @@ hsa_status_t CpuAgent::VisitRegion( hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) const { for (const core::MemoryRegion* region : regions) { + if (!region->user_visible()) continue; hsa_region_t region_handle = core::MemoryRegion::Convert(region); hsa_status_t status = callback(region_handle, data); if (status != HSA_STATUS_SUCCESS) { diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index d7f6a4fdb..ef2ddfb4c 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -65,6 +65,7 @@ #include "core/util/os.h" #include "inc/hsa_ext_image.h" #include "inc/hsa_ven_amd_aqlprofile.h" +#include "inc/hsa_ven_amd_pc_sampling.h" #include "core/inc/amd_trap_handler_v1.h" #include "core/inc/amd_blit_shaders.h" @@ -83,7 +84,6 @@ #define DEFAULT_SCRATCH_BYTES_PER_THREAD 2048 #define MAX_WAVE_SCRATCH 8387584 // See COMPUTE_TMPRING_SIZE.WAVESIZE #define MAX_NUM_DOORBELLS 0x400 -#define MAX_SCRATCH_APERTURE_PER_XCC 4294967296 #define DEFAULT_SCRATCH_SINGLE_LIMIT_ASYNC_PER_XCC (1 << 30) // 1 GB namespace rocr { @@ -92,6 +92,8 @@ extern HsaApiTable hsa_internal_api_table_; } // namespace core namespace AMD { +const uint64_t CP_DMA_DATA_TRANSFER_CNT_MAX = (1 << 26); + GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index) : GpuAgentInt(node), @@ -113,7 +115,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna sdma_blit_used_mask_(0), scratch_limit_async_threshold_(0), scratch_cache_( - [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) { + [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }), + trap_handler_tma_region_(NULL), + 
pcs_hosttrap_data_() { const bool is_apu_node = (properties_.NumCPUCores > 0); profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; @@ -259,68 +263,76 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_1010; ASICShader compute_10; ASICShader compute_11; + ASICShader compute_12; }; std::map compiled_shaders = { {"TrapHandler", { - {NULL, 0, 0, 0}, // gfx7 - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 - {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 - {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a - {NULL, 0, 0, 0}, // gfx940 - {NULL, 0, 0, 0}, // gfx942 - {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 - {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 - {NULL, 0, 0, 0}, // gfx11 + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a + {NULL, 0, 0, 0}, // gfx940 + {NULL, 0, 0, 0}, // gfx942 + {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 + {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 + {NULL, 0, 0, 0}, // gfx11 + // GFX12_TODO: Using one for GFX10 for now. + // If NULL is used (like GFX11), get an assert. 
+ {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx12 }}, {"TrapHandlerKfdExceptions", { - {NULL, 0, 0, 0}, // gfx7 - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a - {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 - {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942 - {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4}, // gfx1010 - {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 - {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942 + {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010 + {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 + {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 + {kCodeTrapHandlerV2_12, sizeof(kCodeTrapHandlerV2_12), 2, 4}, // gfx12 }}, {"CopyAligned", { - {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 - {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx9 - {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx90a - {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 - {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx942 - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 - {kCodeCopyAligned11, 
sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 + {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx9 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx90a + {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx942 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 + {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 + {kCodeCopyAligned12, sizeof(kCodeCopyAligned12), 32, 12}, // gfx12 }}, {"CopyMisaligned", { - {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 - {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx9 - {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx90a - {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10}, // gfx940 - {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx942 - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10 - {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 + {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx9 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx90a + {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx942 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 
23, 10}, // gfx10 + {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 + {kCodeCopyMisaligned12, sizeof(kCodeCopyMisaligned12), 23, 10}, // gfx12 }}, {"Fill", { - {kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7 - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 - {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx9 - {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx90a - {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 - {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx942 - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 - {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 + {kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx9 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx90a + {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx942 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 + {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 + {kCodeFill12, sizeof(kCodeFill12), 19, 8}, // gfx12 }}}; auto compiled_shader_it = compiled_shaders.find(func_name); @@ -363,6 +375,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar case 11: asic_shader = &compiled_shader_it->second.compute_11; break; + case 12: + asic_shader = &compiled_shader_it->second.compute_12; + break; default: assert(false && "Precompiled shader unavailable for target"); } @@ -449,19 +464,22 @@ void GpuAgent::InitRegionList() { case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { MemoryRegion* region = - new MemoryRegion(false, false, false, false, this, mem_props[mem_idx]); + new MemoryRegion(false, false, false, false, true, this, mem_props[mem_idx]); regions_.push_back(region); if (region->IsLocalMemory()) { - regions_.push_back( - new MemoryRegion(false, false, false, 
true, this, mem_props[mem_idx])); - // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. - if ((properties_.HiveID != 0) || - (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) { + // Extended Fine-Grain memory + if (!(isa_->GetMajorVersion() == 12 && isa_->GetMinorVersion() == 0)) regions_.push_back( - new MemoryRegion(true, false, false, false, this, mem_props[mem_idx])); - } + new MemoryRegion(false, false, false, true, true, this, mem_props[mem_idx])); + + // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. + bool user_visible = (properties_.HiveID != 0) || + core::Runtime::runtime_singleton_->flag().fine_grain_pcie(); + + regions_.push_back(new MemoryRegion(true, false, false, false, user_visible, this, + mem_props[mem_idx])); } break; } @@ -502,10 +520,9 @@ void GpuAgent::InitScratchPool() { size_t max_scratch_len = queue_scratch_len_ * max_queues_; #if defined(HSA_LARGE_MODEL) && defined(__linux__) - const size_t max_scratch_device = properties_.NumXcc * MAX_SCRATCH_APERTURE_PER_XCC; // For 64-bit linux use max queues unless otherwise specified - if ((max_scratch_len == 0) || (max_scratch_len > max_scratch_device)) { - max_scratch_len = max_scratch_device; // 4GB per XCC aperture max + if ((max_scratch_len == 0) || (max_scratch_len > MaxScratchDevice())) { + max_scratch_len = MaxScratchDevice(); // 4GB per XCC aperture max } #endif @@ -536,6 +553,12 @@ void GpuAgent::InitAsyncScratchThresholds() { void GpuAgent::ReserveScratch() { size_t reserved_sz = core::Runtime::runtime_singleton_->flag().scratch_single_limit(); + if (reserved_sz > MaxScratchDevice()) { + fprintf(stdout, "User specified scratch limit exceeds device limits (requested:%lu max:%lu)!\n", + reserved_sz, MaxScratchDevice()); + reserved_sz = MaxScratchDevice(); + } + size_t available; HSAKMT_STATUS err = hsaKmtAvailableMemory(node_id(), &available); assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAvailableMemory failed"); @@ -646,6 +669,8 @@ 
hsa_status_t GpuAgent::VisitRegion( void* data) const { AMD::callback_t call(callback); for (const core::MemoryRegion* region : regions) { + if (!region->user_visible()) continue; + const AMD::MemoryRegion* amd_region = reinterpret_cast(region); @@ -687,9 +712,8 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) { break; case 9: sdma = new BlitSdmaV4(); - copy_size_override = (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 10) || - isa_->GetMinorVersion() > 0 ? copy_size_overrides[1] : - copy_size_overrides[0]; + copy_size_override = (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 10) ? + copy_size_overrides[1] : copy_size_overrides[0]; break; case 10: sdma = new BlitSdmaV5(); @@ -697,6 +721,7 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) { copy_size_overrides[1]; break; case 11: + case 12: sdma = new BlitSdmaV5(); copy_size_override = copy_size_overrides[1]; break; @@ -732,18 +757,29 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { void GpuAgent::InitDma() { // Setup lazy init pointers on queues and blits. - auto queue_lambda = [this]() { - auto ret = CreateInterceptibleQueue(); - if (ret == nullptr) + auto queue_lambda = [this](HSA_QUEUE_PRIORITY priority = HSA_QUEUE_PRIORITY_NORMAL) { + auto queue = CreateInterceptibleQueue(); + if (queue == nullptr) throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Internal queue creation failed."); - return ret; + + if (priority != HSA_QUEUE_PRIORITY_NORMAL) + if (queue->SetPriority(priority) != HSA_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, + "Failed to increase queue priority for PC Sampling"); + return queue; }; + // Dedicated compute queue for host-to-device blits. queues_[QueueBlitOnly].reset(queue_lambda); // Share utility queue with device-to-host blits. queues_[QueueUtility].reset(queue_lambda); + // Dedicated compute queue for PC Sampling CP-DMA commands. 
We need a dedicated queue that runs at + // highest priority because we do not want the CP-DMA commands to be delayed/blocked due to + // other dispatches/barriers that could be in the other AQL queues. + queues_[QueuePCSampling].reset([queue_lambda, this]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); }); + // Decide which engine to use for blits. auto blit_lambda = [this](bool use_xgmi, lazy_ptr& queue, bool isHostToDev) { Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); @@ -846,7 +882,7 @@ void GpuAgent::PreloadBlits() { hsa_status_t GpuAgent::PostToolsInit() { // Defer memory allocation until agents have been discovered. - InitNumaAllocator(); + InitAllocators(); InitScratchPool(); BindTrapHandler(); InitDma(); @@ -922,7 +958,6 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, } ScopedAcquire lock(&sdma_gang_lock_); - if (gang_factor == 1) sdma_gang_lock_.Release(); // Manage internal gang signals std::vector gang_signals; if (gang_factor > 1) { @@ -1170,16 +1205,8 @@ void GpuAgent::GetInfoMemoryProperties(uint8_t value[8]) const { }; // Fill the HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU flag - switch (properties_.DeviceId) { - case 0x15DD: /* gfx902 - Raven Ridge */ - case 0x15D8: /* gfx909 - Raven Ridge 2 */ - case 0x1636: /* gfx90c - Renoir */ - case 0x74A0: /* gfx940 and gfx942-APU */ + if (properties_.Integrated) setFlag(HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU); - break; - default: - break; - } } hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { @@ -1302,6 +1329,10 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { setFlag(HSA_EXTENSION_IMAGES); } + if (core::hsa_internal_api_table_.pcs_api.hsa_ven_amd_pcs_iterate_configuration_fn != NULL) { + setFlag(HSA_EXTENSION_AMD_PC_SAMPLING); + } + if (os::LibHandle lib = os::LoadLib(kAqlProfileLib)) { os::CloseLib(lib); setFlag(HSA_EXTENSION_AMD_AQLPROFILE); @@ -1900,8 +1931,7 @@ void 
GpuAgent::AsyncReclaimScratchQueues() { } hsa_status_t GpuAgent::SetAsyncScratchThresholds(size_t use_once_limit) { - if (use_once_limit > properties_.NumXcc * MAX_SCRATCH_APERTURE_PER_XCC) - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if (use_once_limit > MaxScratchDevice()) return HSA_STATUS_ERROR_INVALID_ARGUMENT; scratch_limit_async_threshold_ = use_once_limit; @@ -2042,6 +2072,58 @@ void GpuAgent::SyncClocks() { assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error"); } +hsa_status_t GpuAgent::UpdateTrapHandlerWithPCS(void* pcs_hosttrap_buffers, void* pcs_stochastic_buffers) { + // Assemble the trap handler source code. + void* tma_addr = nullptr; + uint64_t tma_size = 0; + + assert(core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging); + + AssembleShader("TrapHandlerKfdExceptions", AssembleTarget::ISA, trap_code_buf_, + trap_code_buf_size_); + + /* pcs_hosttrap_buffers and pcs_stochastic_buffers are NULL until PC sampling is enabled */ + if (pcs_hosttrap_buffers || pcs_stochastic_buffers) { + // ON non-large BAR systems, we cannot access device memory so we create a host copy + // and then do a DmaCopy to device memory + void* tma_region_host = (uint64_t*)system_allocator()(2 * sizeof(void*), 0x1000, 0); + if (tma_region_host == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + MAKE_SCOPE_GUARD([&]() { system_deallocator()(tma_region_host); }); + + ((uint64_t*)tma_region_host)[0] = (uint64_t)pcs_hosttrap_buffers; + ((uint64_t*)tma_region_host)[1] = (uint64_t)pcs_stochastic_buffers; + + if (!trap_handler_tma_region_) { + trap_handler_tma_region_ = (uint64_t*)finegrain_allocator()(2 * sizeof(void*), 0); + if (trap_handler_tma_region_ == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // NearestCpuAgent owns pool returned system_allocator() + auto cpuAgent = GetNearestCpuAgent()->public_handle(); + + hsa_status_t ret = + AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, trap_handler_tma_region_); + assert(ret == 
HSA_STATUS_SUCCESS); + } + + /* On non-large BAR systems, we may not be able to access device memory, so do a DmaCopy */ + if (DmaCopy(trap_handler_tma_region_, tma_region_host, 2 * sizeof(void*)) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + tma_size = 2 * sizeof(void*); + tma_addr = trap_handler_tma_region_; + } else if (trap_handler_tma_region_) { + finegrain_deallocator()(trap_handler_tma_region_); + trap_handler_tma_region_ = NULL; + } + + // Bind the trap handler to this node. + HSAKMT_STATUS retKmt = + hsaKmtSetTrapHandler(node_id(), trap_code_buf_, trap_code_buf_size_, tma_addr, tma_size); + + return (retKmt != HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; +} + void GpuAgent::BindTrapHandler() { if (isa_->GetMajorVersion() == 7) { // No trap handler support on Gfx7, soft error. @@ -2096,7 +2178,7 @@ void GpuAgent::InvalidateCodeCaches() { // Microcode is handling code cache invalidation. return; } - } else if (isa_->GetMajorVersion() > 11) { + } else if (isa_->GetMajorVersion() > 12) { assert(false && "Code cache invalidation not implemented for this agent"); } @@ -2238,7 +2320,7 @@ void GpuAgent::Trim() { scratch_cache_.trim(false); } -void GpuAgent::InitNumaAllocator() { +void GpuAgent::InitAllocators() { for (auto pool : GetNearestCpuAgent()->regions()) { if (pool->kernarg()) { system_allocator_ = [pool](size_t size, size_t alignment, @@ -2252,11 +2334,29 @@ void GpuAgent::InitNumaAllocator() { }; system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; + } + } + assert(system_allocator_ && "Nearest NUMA node did not have a kernarg pool."); - return; + // Setup fine-grain allocator + for (auto region : regions()) { + const AMD::MemoryRegion* amd_region = (const AMD::MemoryRegion*)region; + if (amd_region->IsLocalMemory() && amd_region->fine_grain()) { + finegrain_allocator_ = [region](size_t size, + MemoryRegion::AllocateFlags alloc_flags) -> void* { + void* ptr = nullptr; + return 
(HSA_STATUS_SUCCESS == + core::Runtime::runtime_singleton_->AllocateMemory(region, size, alloc_flags, &ptr)) + ? ptr + : nullptr; + }; + + finegrain_deallocator_ = [](void* ptr) { + core::Runtime::runtime_singleton_->FreeMemory(ptr); + }; } } - assert(false && "Nearest NUMA node did not have a kernarg pool."); + assert(finegrain_deallocator_ && "Agent does not have a fine-grain allocator"); } core::Agent* GpuAgent::GetNearestCpuAgent() const { @@ -2273,5 +2373,808 @@ core::Agent* GpuAgent::GetNearestCpuAgent() const { return nearCpu; } +hsa_status_t ConvertHsaKmtPcSamplingInfoToHsa(HsaPcSamplingInfo* hsaKmtPcSampling, + hsa_ven_amd_pcs_configuration_t* hsaPcSampling) { + assert(hsaKmtPcSampling && "Invalid hsaKmtPcSampling"); + assert(hsaPcSampling && "Invalid hsaPcSampling"); + + switch (hsaKmtPcSampling->method) { + case HSA_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1: + hsaPcSampling->method = HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1; + break; + case HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1: + hsaPcSampling->method = HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1; + break; + default: + // Sampling method not supported do not return this method to the user + return HSA_STATUS_ERROR; + } + switch (hsaKmtPcSampling->units) { + case HSA_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS: + hsaPcSampling->units = HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS; + break; + case HSA_PC_SAMPLING_UNIT_INTERVAL_CYCLES: + hsaPcSampling->units = HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES; + break; + case HSA_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS: + hsaPcSampling->units = HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS; + break; + default: + // Sampling unit not supported do not return this method to the user + return HSA_STATUS_ERROR; + } + + hsaPcSampling->min_interval = hsaKmtPcSampling->value_min; + hsaPcSampling->max_interval = hsaKmtPcSampling->value_max; + hsaPcSampling->flags = hsaKmtPcSampling->flags; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t 
GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, + void* cb_data) { + uint32_t size = 0; + + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) + return HSA_STATUS_ERROR; + + // First query to get size of list needed + HSAKMT_STATUS ret = hsaKmtPcSamplingQueryCapabilities(node_id(), NULL, 0, &size); + if (ret != HSAKMT_STATUS_SUCCESS || size == 0) return HSA_STATUS_ERROR; + + std::vector sampleInfoList(size); + ret = hsaKmtPcSamplingQueryCapabilities(node_id(), sampleInfoList.data(), sampleInfoList.size(), + &size); + + if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR; + + for (uint32_t i = 0; i < size; i++) { + hsa_ven_amd_pcs_configuration_t hsaPcSampling; + if (ConvertHsaKmtPcSamplingInfoToHsa(&sampleInfoList[i], &hsaPcSampling) == HSA_STATUS_SUCCESS + && cb(&hsaPcSampling, cb_data) == HSA_STATUS_INFO_BREAK) + return HSA_STATUS_SUCCESS; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) { + hsa_status_t ret; + HsaPcSamplingInfo sampleInfo = {}; + HsaPcSamplingTraceId thunkId; + + // IOCTL id does not exist at the moment, so passing 0 is OK, + // since it will be overridden later in this function. + ret = PcSamplingCreateFromId(0, session); + if (ret != HSA_STATUS_SUCCESS) return ret; + + session.GetHsaKmtSamplingInfo(&sampleInfo); + HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId); + if (retkmt != HSAKMT_STATUS_SUCCESS) { + return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED) ? 
(hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY + : HSA_STATUS_ERROR; + } + + debug_print("Created PC sampling session with thunkId:%d\n", thunkId); + + session.SetThunkId(thunkId); + + return ret; +} + +hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId, + pcs::PcsRuntime::PcSamplingSession& session) { + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + // TODO: For now can only have 1 hosttrap session at a time. As a final solution, we want to be + // able to support multiple sessions at a time. But this makes the session.HandleSampleData more + // complicated if multiple sessions have different buffer sizes. + if (ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // This is current amd_aql_queue->pm4_ib_size_b_ + ht_data.cmd_data_sz = 0x1000; + ht_data.cmd_data = (uint32_t*)malloc(ht_data.cmd_data_sz); + assert(ht_data.cmd_data); + + if (HSA::hsa_signal_create(1, 0, NULL, &ht_data.exec_pm4_signal) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + ht_data.old_val = (uint64_t*)system_allocator()(sizeof(uint64_t), 0x1000, 0); + assert(ht_data.old_val); + + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.old_val)) + return HSA_STATUS_ERROR; + + // Local copy of hosttrap data - we cannot access device memory directly on non-large BAR + // systems + pcs_hosttrap_sampling_data_t* device_datahost = + (pcs_hosttrap_sampling_data_t*)system_allocator()(sizeof(*device_datahost), 0x1000, 0); + if (!device_datahost) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + MAKE_SCOPE_GUARD([&]() { system_deallocator()(device_datahost); }); + + memset(device_datahost, 0, sizeof(*device_datahost)); + + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, device_datahost) != + HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + MAKE_NAMED_SCOPE_GUARD(freeHostTrapResources, [&]() { + if (ht_data.device_data) { + if (ht_data.device_data->done_sig0.handle) + 
HSA::hsa_signal_destroy(ht_data.device_data->done_sig0); + if (ht_data.device_data->done_sig1.handle) + HSA::hsa_signal_destroy(ht_data.device_data->done_sig1); + + finegrain_deallocator()(ht_data.device_data); + } + if (ht_data.host_buffer) system_deallocator()(ht_data.host_buffer); + }); + + // Force creating of PC Sampling queue to trigger exception early in case we exceed max availble + // CP queues on this agent + queues_[QueuePCSampling].touch(); + + /* + * When calling queue->ExecutePM4() Indirect Buffer size which is 0x1000 bytes (1024 DW). + * The maximum indirect buffer size we need occurs when we enqueue the + * WAIT_REG_MEM, DMA_COPY(s), WRITE_DATA ops: + * For WAIT_REG_MEM = 7 DW + * For each DMA_COPY = 7 DW + * For WRITE_DATA_CMD = 6 DW + * + * So maximum number of DMA_COPY ops is: + * (MAX_IB_SIZE - sizeof(WAIT_REG_MEM) - sizeof(WRITE_DATA_CMD)) / sizeof(DMA_COPY) + * (1024 - 7 - 6) / 7 = 144 + * + * Each DMA_COPY op can transfer (1 << 26) bytes, which is 9 GB. trap_buffer_size is a 32-bit + * number, so the buffer must be < 4 GB. So we are not limited by Indirect Buffer size. + * Set current limit to 256 MB to limit device VRAM usage + */ + const size_t max_trap_buffer_size = + core::Runtime::runtime_singleton_->flag().pc_sampling_max_device_buffer_size(); + + /* + * We use a double-buffer mechanism where there are 2 trap-buffers and 1 host-buffer + * Warning: This currently assumes that client latency is smaller than time to fill 1 + * trap-buffer If latency is bigger, we have to increate host-buffer + * + * host-buffer must be >= client-buffer so that we can copy full size of client-buffer each + * time. To avoid having to deal with wrap-arounds, host-buffer must be a multiple of + * trap-buffers + * + * if client-buffer size is greater than 2x max_trap_buffer_size: + * We are limited by max_trap_buffer_size. 
+ * trap-buffer = max-trap-buffer-size + * host-buffer = 2*smallest size greater than client-buffer but multiple of 1 trap-buffer + * else: + * We reduce the trap-buffers so that: + * trap-buffer = half of user-buffer + * host-buffer = 2*user-buffer + * + * TODO: We are currently using a temporary host-buffer so that we can increase host-buffer to + * factor in client latency. Using a direct-copy to the client buffer would be more efficient. + * Revisit this once we have empirical data of latency vs how long it takes to fill 1 + * trap-buffer. + */ + + size_t trap_buffer_size = 0; + if (session.buffer_size() > 2 * max_trap_buffer_size) { + trap_buffer_size = max_trap_buffer_size; + ht_data.host_buffer_size = 2 * AlignUp(session.buffer_size(), trap_buffer_size); + } else { + trap_buffer_size = session.buffer_size() / 2; + ht_data.host_buffer_size = 2 * session.buffer_size(); + } + + ht_data.host_buffer = (uint8_t*)system_allocator()(ht_data.host_buffer_size, 0x1000, 0); + if (!ht_data.host_buffer) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + if (AMD::hsa_amd_agents_allow_access(1, &public_handle_, NULL, ht_data.host_buffer) != + HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + device_datahost->buf_size = trap_buffer_size / session.sample_size(); + + if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig0) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + if (HSA::hsa_signal_create(1, 0, NULL, &device_datahost->done_sig1) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // TODO: Once we have things working and can measure + // latency after 2nd level trap handler decrements signals and set watermark accordingly + device_datahost->buf_watermark0 = 0.8 * device_datahost->buf_size; + device_datahost->buf_watermark1 = 0.8 * device_datahost->buf_size; + + // Allocate device memory for 2nd level trap handler TMA + size_t deviceAllocSize = sizeof(*ht_data.device_data) + (2 * trap_buffer_size); + ht_data.device_data = 
(pcs_hosttrap_sampling_data_t*)finegrain_allocator()(deviceAllocSize, 0); + if (ht_data.device_data == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // This cpuAgent is the owner of the system_allocator() pool + auto cpuAgent = GetNearestCpuAgent()->public_handle(); + hsa_status_t ret = AMD::hsa_amd_agents_allow_access(1, &cpuAgent, NULL, ht_data.device_data); + assert(ret == HSA_STATUS_SUCCESS); + + if (DmaCopy(ht_data.device_data, device_datahost, sizeof(*device_datahost)) != + HSA_STATUS_SUCCESS) { + debug_print("Failed to dmaCopy!\n"); + return HSA_STATUS_ERROR; + } + + uint8_t* device_buf_ptr = + ((uint8_t*)ht_data.device_data) + sizeof(pcs_hosttrap_sampling_data_t); + if (DmaFill(device_buf_ptr, 0, deviceAllocSize - sizeof(pcs_hosttrap_sampling_data_t)) != + HSA_STATUS_SUCCESS) { + debug_print("Failed to dmaFill!\n"); + return HSA_STATUS_ERROR; + } + + ht_data.lost_sample_count = 0; + ht_data.host_buffer_wrap_pos = 0; + ht_data.host_write_ptr = ht_data.host_buffer; + ht_data.host_read_ptr = ht_data.host_write_ptr; + + ht_data.session = &session; + freeHostTrapResources.Dismiss(); + + if (UpdateTrapHandlerWithPCS(ht_data.device_data, NULL) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; + } + + session.SetThunkId(ioctlId); + ht_data.session = &session; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) { + if (PcSamplingStop(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId()); + ht_data.session = NULL; + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + free(ht_data.cmd_data); + system_deallocator()(ht_data.old_val); + HSA::hsa_signal_destroy(ht_data.exec_pm4_signal); + HSA::hsa_signal_destroy(ht_data.device_data->done_sig0); + HSA::hsa_signal_destroy(ht_data.device_data->done_sig1); + finegrain_deallocator()(ht_data.device_data); + 
system_deallocator()(ht_data.host_buffer); + + ht_data.device_data = NULL; + ht_data.host_buffer = NULL; + ht_data.session = NULL; + + UpdateTrapHandlerWithPCS(NULL, NULL); + } + return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; +} + +hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) { + if (session.isActive()) return HSA_STATUS_SUCCESS; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + auto method = session.method(); + if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + if (ht_data.session->isActive()) { + debug_warning("Already have a Host trap session in progress!"); + return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY; + } + ht_data.session->start(); + // This thread will handle all hosttrap sessions on this agent + // In the future, there will be another thread to handle stochastic sessions. + ht_data.thread = os::CreateThread(PcSamplingThreadRun, (void*)this); + if (!ht_data.thread) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Failed to start PC Sampling thread."); + } + + if (hsaKmtPcSamplingStart(node_id(), session.ThunkId()) == HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_SUCCESS; + + debug_print("Failed to start PC sampling session with thunkId:%d\n", session.ThunkId()); + if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + ht_data.session->stop(); + os::WaitForThread(ht_data.thread); + os::CloseThread(ht_data.thread); + ht_data.thread = NULL; + } + + return HSA_STATUS_ERROR; +} + +hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) { + if (!session.isActive()) return HSA_STATUS_SUCCESS; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + session.stop(); + + HSAKMT_STATUS retKmt = hsaKmtPcSamplingStop(node_id(), session.ThunkId()); + if (retKmt != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session."); + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + // Wake up 
pcs_hosttrap_thread_ if it is waiting for data + HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig0, -1); + HSA::hsa_signal_store_screlease(ht_data.device_data->done_sig1, -1); + + os::WaitForThread(ht_data.thread); + os::CloseThread(ht_data.thread); + ht_data.thread = NULL; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::PcSamplingFlushHostTrapDeviceBuffers( + pcs::PcsRuntime::PcSamplingSession& session) { + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + uint32_t& which_buffer = ht_data.which_buffer; + uint32_t* cmd_data = ht_data.cmd_data; + size_t& cmd_data_sz = ht_data.cmd_data_sz; + uint64_t* old_val = ht_data.old_val; + hsa_signal_t& exec_pm4_signal = ht_data.exec_pm4_signal; + + /* + * Device-buffer to Host-buffer to User-Buffer copy logic + * + * Device-buffer = buffer written by 2nd level trap handler + * Host-buffer = buffer inside ROCr + * User-buffer = Session buffer size specified in PCSamplingSessionCreate + * + * Conditions for the buffer sizes: + * Host buffer is at least 2 times bigger than device buffer and Host buffer + * is also at least 2 times bigger than User-Buffer. + * + * Key: + * Device-Buffer[==--][----] : Device-Buffer#1 has size 4*N, and is half-full + * Device-Buffer#2 has size 4*N and is empty + * + * Host-Buffer[=---------] : Host Buffer has size 10*N and is filled with N. + * + * N will vary based on the User-buffer size, this example is to show the + * relative sizes between each copy. + * + * 1. Initial state + * - User has created a new session with buffer size = 7*N + * + * Device-Buffer[---][---] + * Host-Buffer[--------------] wptr=0 rptr=0 wrap_pos=0 + * User-Buffer[-------] + * + * -- Device Buffer has size 3*N + * -- Host-Buffer has size 14*N (2x User-Buffer) + * -- User-Buffer has size 7*N + * + * 2. 
Device Buffer#1 hits watermark + * State at beginning: + * Device-Buffer[===][---] + * Host-Buffer[--------------] + * User-Buffer[-------] + * + * -- Copy 3*N from Device-Buffer#1 to Host-Buffer + * -- In the meantime, 2nd level trap handler is writing to Device-Buffer#2 + * -- We do not have enough data to fill User-Buffer + * + * State at end: + * Device-Buffer[---][=--] + * Host-Buffer[===-----------] wptr=3 rptr=0, wrap_pos=0 + * User-Buffer[-------] + * + * 3. Device Buffer#2 hits watermark + * State at beginning: + * Device-Buffer[---][===] + * Host-Buffer[===-----------] + * User-Buffer[-------] + * + * -- Copy 3*N from Device-Buffer#2 to Host-Buffer + * -- In the meantime, 2nd level trap handler is writing to Device-Buffer#1 + * -- We do not have enough data to fill User-Buffer + * + * State at end: + * Device-Buffer[=--][---] + * Host-Buffer[======--------] wptr=6 rptr=0 wrap_pos=0 + * User-Buffer[-------] + * + * 4. Device Buffer#1 hits watermark + * State at beginning: + * Device-Buffer[---][===] + * Host-Buffer[======--------] + * User-Buffer[-------] + * + * -- Copy 3*N from Device-Buffer#2 to Host-Buffer + * -- In the meantime, 2nd level trap handler is writing to Device-Buffer#1 + * + * Device-Buffer[=--][---] + * Host-Buffer[=========-----] + * User-Buffer[-------] + * + * -- We have enough data to fill User-Buffer. Callback user data-ready to + * -- copy 7*N to user. + * + * Device-Buffer[=--][---] + * Host-Buffer[-------==-----] + * User-Buffer[=======] + * + * -- User processes User-Buffer + * + * Device-Buffer[=--][---] + * Host-Buffer[-------==-----] wptr=9 rptr=7 wrap_pos=0 + * User-Buffer[-------] + * + * 6. Device Buffer#1 hits watermark + * State at end: + * Device-Buffer[---][=--] + * Host-Buffer[-------=====--] wptr=12 rptr=7 wrap_pos=0 + * User-Buffer[-------] + * + * 7. 
Device Buffer#2 hits watermark + * State at beginning: + * Device-Buffer[---][===] + * Host-Buffer[-------=====--] wptr=12 rptr=7 wrap_pos=0 + * User-Buffer[-------] + * + * -- We do not have enough space after wptr. The CP-DMA copy + * -- can only copy a contiguous range, so copy to the + * -- beginning of Host-Buffer and set wrap_pos + * + * Device-Buffer[=--][---] + * Host-Buffer[===----=====--] wptr=3 rptr=7 wrap_pos=12 + * User-Buffer[-------] + * + * -- We have enough data to fill User-Buffer. Callback user data-ready to + * -- copy 7*N to user. We copy the tail end (index 7-12) of Host-Buffer + * -- before copying the beginning of Host-Buffer (index 0-2). + * + * Device-Buffer[=--][---] + * Host-Buffer[--=-----------] wptr=3 rptr=2 wrap_pos=0 + * User-Buffer[=======] + * + * -- User processes User-Buffer + * + * 8. Device Buffer#1 hits watermark + * State at end: + * Device-Buffer[---][=--] + * Host-Buffer[--====--------] wptr=6 rptr=2 wrap_pos=0 + * User-Buffer[-------] + */ + + uint32_t next_buffer; + + uint64_t reset_write_val; + uint32_t to_copy, copy_bytes; + + const uint32_t atomic_ex_cmd_sz = 9; + const uint32_t wait_reg_mem_cmd_sz = 7; + const uint32_t dma_data_cmd_sz = 7; + const uint32_t copy_data_cmd_sz = 6; + const uint32_t write_data_cmd_sz = 5; + + uint8_t* host_buffer_begin = ht_data.host_buffer; + uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + + uint64_t buf_write_val = (uint64_t) & (ht_data.device_data->buf_write_val); + uint64_t buf_written_val[] = {(uint64_t) & (ht_data.device_data->buf_written_val0), + (uint64_t) & (ht_data.device_data->buf_written_val1)}; + + size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) + + sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1); + + hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1}; + uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset, + (uint8_t*)ht_data.device_data + buf_offset + + 
ht_data.device_data->buf_size * session.sample_size()}; + + next_buffer = (which_buffer + 1) % 2; + reset_write_val = (uint64_t)next_buffer << 63; + + /* + * ATOMIC_MEM, perform atomic_exchange + * We use a double-buffer mechanism so that trap handlers calls are writing to one buffer while + * hsa-runtime is copying data from the other buffer. + * + * 1. Atomically swap buffers on the device. Future trap handler calls will put their data into + * next_buffer. + * 2. Return a 64-bit packed value to ROCr; the upper bit is the old buffer and can be ignored. + * The lower 63 bits are how many trap handler entrances happened before the atomic swap + * i.e., what value to wait for in buf_written_val to know all previous trap entries were + * done. + */ + unsigned int i = 0; + memset(cmd_data, 0, cmd_data_sz); + cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_ATOMIC_MEM, atomic_ex_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_ATOMIC_MEM_DW1_ATOMIC(PM4_ATOMIC_MEM_GL2_OP_ATOMIC_SWAP_RTN_64); + cmd_data[i++] = PM4_ATOMIC_MEM_DW2_ADDR_LO(buf_write_val); + cmd_data[i++] = PM4_ATOMIC_MEM_DW3_ADDR_HI((buf_write_val) >> 32); + cmd_data[i++] = PM4_ATOMIC_MEM_DW4_SRC_DATA_LO((uint64_t)reset_write_val); + cmd_data[i++] = PM4_ATOMIC_MEM_DW5_SRC_DATA_HI(((uint64_t)reset_write_val) >> 32); + i += 3; + /* copy data */ + cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_COPY_DATA, copy_data_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = + PM4_COPY_DATA_DW1(PM4_COPY_DATA_SRC_SEL_ATOMIC_RETURN_DATA | PM4_COPY_DATA_DST_SEL_TC_12 | + PM4_COPY_DATA_COUNT_SEL | PM4_COPY_DATA_WR_CONFIRM); + i += 2; + cmd_data[i++] = PM4_COPY_DATA_DW4_DST_ADDR_LO((uint64_t)old_val); + cmd_data[i++] = PM4_COPY_DATA_DW5_DST_ADDR_HI(((uint64_t)old_val) >> 32); + + HSA::hsa_signal_store_screlease(exec_pm4_signal, 1); + + queues_[QueuePCSampling]->ExecutePM4( + cmd_data, (atomic_ex_cmd_sz + copy_data_cmd_sz) * sizeof(uint32_t), HSA_FENCE_SCOPE_NONE, + HSA_FENCE_SCOPE_SYSTEM, &exec_pm4_signal); + do { + 
hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( + exec_pm4_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + if (val == -1) return HSA_STATUS_SUCCESS; + if (val == 0) break; + } while (true); + + *old_val &= (ULLONG_MAX >> 1); + /* If the number of entries in old_val is larger than buf_size, then there was a buffer overflow + * and the 2nd level trap handler code will skip recording samples, causing lost samples + */ + if (*old_val > (uint64_t)ht_data.device_data->buf_size) { + ht_data.lost_sample_count = *old_val - (uint64_t)ht_data.device_data->buf_size; + *old_val = (uint64_t)ht_data.device_data->buf_size; + } + + to_copy = *old_val * session.sample_size(); + + /* Make sure there is enough space after host_write_ptr */ + if (ht_data.host_write_ptr + to_copy >= host_buffer_end) { + // Need to wrap around + ht_data.host_buffer_wrap_pos = ht_data.host_write_ptr; + ht_data.host_write_ptr = host_buffer_begin; + } + /* + * Do the WAIT_REG_MEM, DMA_DATA(s) and WRITE_DATA + * + * 1. Wait for all trap handlers have finished writing values to this buffer by waiting for + * buf_written_val to equal to old_val. + * 2. Copy the values out of buffer to the host buffers. + * 3. Reset buf_written_val so that we start writing to beginning of this buffer on the next + * buffer swap. 
+ */ + i = 0; + memset(cmd_data, 0, cmd_data_sz); + + /* WAIT_REG_MEM, wait on buf_written_val */ + cmd_data[i++] = + PM4_HDR(PM4_HDR_IT_OPCODE_WAIT_REG_MEM, wait_reg_mem_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW1(PM4_WAIT_REG_MEM_FUNCTION_EQUAL_TO_REFERENCE | + PM4_WAIT_REG_MEM_MEM_SPACE_MEMORY_SPACE | + PM4_WAIT_REG_MEM_OPERATION_WAIT_REG_MEM); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW2_MEM_POLL_ADDR_LO(buf_written_val[which_buffer]); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW3_MEM_POLL_ADDR_HI((buf_written_val[which_buffer]) >> 32); + cmd_data[i++] = PM4_WAIT_REG_MEM_DW4_REFERENCE(*old_val); + cmd_data[i++] = 0xFFFFFFFF; + cmd_data[i++] = PM4_WAIT_REG_MEM_DW6(PM4_WAIT_REG_MEM_POLL_INTERVAL(4) | + PM4_WAIT_REG_MEM_OPTIMIZE_ACE_OFFLOAD_MODE); + + unsigned int num_copy_command = 0; + uint8_t* buffer_temp = buffer[which_buffer]; + for (copy_bytes = CP_DMA_DATA_TRANSFER_CNT_MAX; 0 < to_copy; to_copy -= copy_bytes) { + num_copy_command++; + + /* DMA_DATA PACKETS, copy buffer using CPDMA */ + cmd_data[i++] = PM4_HDR(PM4_HDR_IT_OPCODE_DMA_DATA, dma_data_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_DMA_DATA_DW1(PM4_DMA_DATA_DST_SEL_DST_ADDR_USING_L2 | + PM4_DMA_DATA_SRC_SEL_SRC_ADDR_USING_L2); + cmd_data[i++] = PM4_DMA_DATA_DW2_SRC_ADDR_LO((uint64_t)buffer_temp); + cmd_data[i++] = PM4_DMA_DATA_DW3_SRC_ADDR_HI(((uint64_t)buffer_temp) >> 32); + cmd_data[i++] = PM4_DMA_DATA_DW4_DST_ADDR_LO((uint64_t)ht_data.host_write_ptr); + cmd_data[i++] = PM4_DMA_DATA_DW5_DST_ADDR_HI(((uint64_t)ht_data.host_write_ptr) >> 32); + + if (copy_bytes >= to_copy) { + copy_bytes = to_copy; + cmd_data[i++] = + PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC_LAST); + } else { + cmd_data[i++] = PM4_DMA_DATA_DW6(PM4_DMA_DATA_BYTE_COUNT(copy_bytes) | PM4_DMA_DATA_DIS_WC); + } + + buffer_temp += copy_bytes; + ht_data.host_write_ptr += copy_bytes; + } + + /* WRITE_DATA, Reset buf_written_val */ + cmd_data[i++] = 
PM4_HDR(PM4_HDR_IT_OPCODE_WRITE_DATA, write_data_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = PM4_WRITE_DATA_DW1(PM4_WRITE_DATA_DST_SEL_TC_L2 | + PM4_WRITE_DATA_WR_CONFIRM_WAIT_CONFIRMATION); + cmd_data[i++] = PM4_WRITE_DATA_DW2_DST_MEM_ADDR_LO(buf_written_val[which_buffer]); + cmd_data[i++] = PM4_WRITE_DATA_DW3_DST_MEM_ADDR_HI((buf_written_val[which_buffer]) >> 32); + cmd_data[i++] = PM4_WRITE_DATA_DW4_DATA(0); + + unsigned int cmd_sz = + wait_reg_mem_cmd_sz + (num_copy_command * dma_data_cmd_sz) + write_data_cmd_sz; + + HSA::hsa_signal_store_screlease(exec_pm4_signal, 1); + queues_[QueuePCSampling]->ExecutePM4(cmd_data, cmd_sz * sizeof(uint32_t), HSA_FENCE_SCOPE_NONE, + HSA_FENCE_SCOPE_SYSTEM, &exec_pm4_signal); + do { + hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( + exec_pm4_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + if (val == -1) return HSA_STATUS_SUCCESS; + if (val == 0) break; + } while (true); + + which_buffer = next_buffer; + + return HSA_STATUS_SUCCESS; +} + +void GpuAgent::PcSamplingThread() { + // TODO: Implement lost sample count + // TODO: Implement latency + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + pcs::PcsRuntime::PcSamplingSession& session = *ht_data.session; + uint32_t& which_buffer = ht_data.which_buffer; + + hsa_status_t ret = HSA_STATUS_SUCCESS; + uint8_t* host_buffer_begin = ht_data.host_buffer; + uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + + size_t const buf_offset = offsetof(pcs_hosttrap_sampling_data_t, reserved1) + + sizeof(((pcs_hosttrap_sampling_data_t*)0)->reserved1); + + hsa_signal_t done_sig[] = {ht_data.device_data->done_sig0, ht_data.device_data->done_sig1}; + uint8_t* buffer[] = {(uint8_t*)ht_data.device_data + buf_offset, + (uint8_t*)ht_data.device_data + buf_offset + + ht_data.device_data->buf_size * session.sample_size()}; + + while (ht_data.session->isActive()) { + do { + hsa_signal_value_t val = HSA::hsa_signal_wait_scacquire( + 
done_sig[which_buffer], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + if (val == -1) goto thread_exit; + if (val == 0) break; + } while (true); + HSA::hsa_signal_store_screlease(done_sig[which_buffer], 1); + + std::lock_guard lock(ht_data.host_buffer_mutex); + if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS) + goto thread_exit; + + size_t bytes_before_wrap; + size_t bytes_after_wrap; + + assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end); + assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end); + assert(ht_data.host_buffer_wrap_pos ? (ht_data.host_read_ptr > ht_data.host_write_ptr) + : (ht_data.host_read_ptr <= ht_data.host_write_ptr)); + + if (ht_data.host_buffer_wrap_pos) { + assert(ht_data.host_buffer_wrap_pos <= host_buffer_end && + ht_data.host_buffer_wrap_pos > host_buffer_begin); + assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos); + + // Wrapped around + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin; + + while (bytes_before_wrap >= session.buffer_size()) { + session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + + if (bytes_before_wrap + bytes_after_wrap >= session.buffer_size()) { + session.HandleSampleData(ht_data.host_read_ptr, bytes_before_wrap, host_buffer_begin, + (session.buffer_size() - bytes_before_wrap), 0); + ht_data.host_read_ptr = host_buffer_begin + (session.buffer_size() - bytes_before_wrap); + bytes_before_wrap = 0; + ht_data.host_buffer_wrap_pos = 0; + bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + + while (bytes_after_wrap >= 
session.buffer_size()) { + session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = 0; + bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } else { + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + + while (bytes_before_wrap >= session.buffer_size()) { + assert(ht_data.host_read_ptr >= host_buffer_begin && + ht_data.host_read_ptr + session.buffer_size() < host_buffer_end); + session.HandleSampleData(ht_data.host_read_ptr, session.buffer_size(), NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += session.buffer_size(); + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } + } +thread_exit: + debug_print("PcSamplingThread::Exiting\n"); +} + +void GpuAgent::PcSamplingThreadRun(void* _agent) { + GpuAgent* agent = (GpuAgent*)_agent; + agent->PcSamplingThread(); + debug_print("PcSamplingThread exiting..."); +} + +hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) { + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + uint8_t* host_buffer_begin = ht_data.host_buffer; + uint8_t* host_buffer_end = ht_data.host_buffer + ht_data.host_buffer_size; + + size_t bytes_before_wrap; + size_t bytes_after_wrap; + + std::lock_guard lock(ht_data.host_buffer_mutex); + if (PcSamplingFlushHostTrapDeviceBuffers(session) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + assert(ht_data.host_read_ptr >= host_buffer_begin && ht_data.host_read_ptr < host_buffer_end); + assert(ht_data.host_write_ptr >= host_buffer_begin && ht_data.host_write_ptr < host_buffer_end); + assert(ht_data.host_buffer_wrap_pos ? 
(ht_data.host_read_ptr > ht_data.host_write_ptr) + : (ht_data.host_read_ptr <= ht_data.host_write_ptr)); + + if (ht_data.host_buffer_wrap_pos) { + assert(ht_data.host_buffer_wrap_pos <= host_buffer_end && + ht_data.host_buffer_wrap_pos > host_buffer_begin); + assert(ht_data.host_read_ptr <= ht_data.host_buffer_wrap_pos); + + // Wrapped around + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + bytes_after_wrap = ht_data.host_write_ptr - host_buffer_begin; + + while (bytes_before_wrap > 0) { + size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size()); + + session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += bytes_to_copy; + bytes_before_wrap = ht_data.host_buffer_wrap_pos - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + + assert(ht_data.host_read_ptr == ht_data.host_buffer_wrap_pos); + ht_data.host_buffer_wrap_pos = 0; + ht_data.host_read_ptr = host_buffer_begin; + + while (bytes_after_wrap > 0) { + size_t bytes_to_copy = std::min(bytes_after_wrap, session.buffer_size()); + + session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += bytes_to_copy; + bytes_after_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } else { + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + + while (bytes_before_wrap) { + size_t bytes_to_copy = std::min(bytes_before_wrap, session.buffer_size()); + assert(ht_data.host_read_ptr >= host_buffer_begin && + ht_data.host_read_ptr + bytes_to_copy <= host_buffer_end); + + session.HandleSampleData(ht_data.host_read_ptr, bytes_to_copy, NULL, 0, + ht_data.lost_sample_count); + ht_data.host_read_ptr += bytes_to_copy; + bytes_before_wrap = ht_data.host_write_ptr - ht_data.host_read_ptr; + ht_data.lost_sample_count = 0; + } + } + return HSA_STATUS_SUCCESS; +} + } // namespace amd } // 
namespace rocr diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index f664102fd..7d38a83b4 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -50,12 +50,14 @@ #include "core/inc/amd_gpu_agent.h" #include "core/util/utils.h" #include "core/inc/exceptions.h" +#include namespace rocr { namespace AMD { // Tracks aggregate size of system memory available on platform size_t MemoryRegion::max_sysmem_alloc_size_ = 0; +size_t MemoryRegion::kPageSize_ = sysconf(_SC_PAGESIZE); void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) { void* ret = NULL; @@ -100,9 +102,10 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { } MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, - bool extended_scope_fine_grain, core::Agent* owner, + bool extended_scope_fine_grain, bool user_visible, core::Agent* owner, const HsaMemoryProperties& mem_props) - : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, owner), + : core::MemoryRegion(fine_grain, kernarg, full_profile, extended_scope_fine_grain, user_visible, + owner), mem_props_(mem_props), max_single_alloc_size_(0), virtual_size_(0), @@ -136,7 +139,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, virtual_size_ = kGpuVmSize; } else if (IsSystem()) { - mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; + mem_flag_.ui32.PageSize = MemoryRegion::kPageSize_; mem_flag_.ui32.NoSubstitute = 0; mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; @@ -161,7 +164,6 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, } assert(GetVirtualSize() != 0); - assert(GetPhysicalSize() <= GetVirtualSize()); assert(IsMultipleOf(max_single_alloc_size_, kPageSize_)); } @@ -207,6 +209,12 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, 
kmt_alloc_flags.ui32.NoSubstitute = (alloc_flags & AllocatePinned ? 1 : kmt_alloc_flags.ui32.NoSubstitute); kmt_alloc_flags.ui32.GTTAccess = (alloc_flags & AllocateGTTAccess ? 1 : kmt_alloc_flags.ui32.GTTAccess); + if (IsLocalMemory()) { + // Allocate physically contiguous memory - AllocateKfdMemory function call will fail + // if this flag is not supported in KFD. + kmt_alloc_flags.ui32.Contiguous = + (alloc_flags & AllocateContiguous ? 1 : kmt_alloc_flags.ui32.Contiguous); + } // Only allow using the suballocator for ordinary VRAM. if (IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) { @@ -237,9 +245,9 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, *address = AllocateKfdMemory(kmt_alloc_flags, node_id, size); } - if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS; - if (*address != nullptr) { + if (kmt_alloc_flags.ui32.NoAddress) return HSA_STATUS_SUCCESS; + // Commit the memory. // For system memory, on non-restricted allocation, map it to all GPUs. 
On // restricted allocation, only CPU is allowed to access by default, so diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp index 89c5cd260..e595bffac 100644 --- a/src/core/runtime/amd_topology.cpp +++ b/src/core/runtime/amd_topology.cpp @@ -367,6 +367,17 @@ void BuildTopology() { } const_cast(core::Runtime::runtime_singleton_->flag()).parse_masks(maxGpu, maxCu); + // Temporary work-around, disable SDMA ganging on non-APUs in non-SPX modes + // Check xGMI APU status + bool isXgmiApu = false; + auto& firstCpu = core::Runtime::runtime_singleton_->cpu_agents()[0]; + for (auto& peer_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { + auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(firstCpu->node_id(), + peer_gpu->node_id()); + isXgmiApu = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI; + if (isXgmiApu) break; + } + // Register destination agents that can SDMA gang copy for source agents for (auto& src_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { uint32_t src_id = src_gpu->node_id(); @@ -383,7 +394,7 @@ void BuildTopology() { // Weight of 41 - Inter-socket GPU link in multi-partition mode if (linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) { if (linfo.info.numa_distance == 13 || linfo.info.numa_distance == 41) - gang_factor = 2; + gang_factor = isXgmiApu ? 
2 : 1; else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth) gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth; else gang_factor = 1; diff --git a/src/core/runtime/blit_shaders/CMakeLists.txt b/src/core/runtime/blit_shaders/CMakeLists.txt index dc32b2f2b..e63d380b8 100644 --- a/src/core/runtime/blit_shaders/CMakeLists.txt +++ b/src/core/runtime/blit_shaders/CMakeLists.txt @@ -49,9 +49,10 @@ find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) # Set the target devices -set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100") +set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100;gfx1200") + # Set the postfix for each target device -set (POSTFIX "9;940;1010;10;11") +set (POSTFIX "9;940;1010;10;11;12") # If verbose output is enabled, print paths and target devices if(${CMAKE_VERBOSE_MAKEFILE}) diff --git a/src/core/runtime/blit_shaders/blit_copyAligned.s b/src/core/runtime/blit_shaders/blit_copyAligned.s index 750366ff6..c861147aa 100644 --- a/src/core/runtime/blit_shaders/blit_copyAligned.s +++ b/src/core/runtime/blit_shaders/blit_copyAligned.s @@ -146,8 +146,12 @@ compute_pgm_rsrc1_vgprs = CopyAlignedRsrc1VGPRs s_load_dword s24, s[0:1], 0x50 s_waitcnt lgkmcnt(0) - + .if (.amdgcn.gfx_generation_number == 12) + s_lshl_b32 s2, ttmp9, 0x6 + .else s_lshl_b32 s2, s2, 0x6 + .endif + V_ADD_CO_U32 v0, s2, v0 v_mov_b32 v3, s5 diff --git a/src/core/runtime/blit_shaders/blit_copyMisaligned.s b/src/core/runtime/blit_shaders/blit_copyMisaligned.s index a63b2ace5..48a5b3ec3 100644 --- a/src/core/runtime/blit_shaders/blit_copyMisaligned.s +++ b/src/core/runtime/blit_shaders/blit_copyMisaligned.s @@ -117,7 +117,12 @@ CopyMisaligned: s_load_dword s16, s[0:1], 0x30 s_waitcnt lgkmcnt(0) - s_lshl_b32 s2, s2, 0x6 + .if (.amdgcn.gfx_generation_number == 12) + s_lshl_b32 s2, ttmp9, 0x6 + .else + s_lshl_b32 s2, s2, 0x6 + .endif + V_ADD_CO_U32 v0, s2, 
v0 v_mov_b32 v3, s5 diff --git a/src/core/runtime/blit_shaders/blit_fill.s b/src/core/runtime/blit_shaders/blit_fill.s index bdc4fbcc5..752499b4f 100644 --- a/src/core/runtime/blit_shaders/blit_fill.s +++ b/src/core/runtime/blit_shaders/blit_fill.s @@ -117,7 +117,12 @@ Fill: s_load_dwordx4 s[8:11], s[0:1], 0x10 s_waitcnt lgkmcnt(0) - s_lshl_b32 s2, s2, 0x6 + .if (.amdgcn.gfx_generation_number == 12) + s_lshl_b32 s2, ttmp9, 0x6 + .else + s_lshl_b32 s2, s2, 0x6 + .endif + V_ADD_CO_U32 v0, s2, v0 .macro mFillPattern iter iter_end diff --git a/src/core/runtime/default_signal.cpp b/src/core/runtime/default_signal.cpp index 820fc75ca..b3e5a23f2 100644 --- a/src/core/runtime/default_signal.cpp +++ b/src/core/runtime/default_signal.cpp @@ -57,7 +57,7 @@ int BusyWaitSignal::rtti_id_ = 0; BusyWaitSignal::BusyWaitSignal(SharedSignal* abi_block, bool enableIPC) : Signal(abi_block, enableIPC) { signal_.kind = AMD_SIGNAL_KIND_USER; - signal_.event_mailbox_ptr = NULL; + signal_.event_mailbox_ptr = uint64_t(NULL); } hsa_signal_value_t BusyWaitSignal::LoadRelaxed() { diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index c509fda5e..8ad8ff264 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -343,7 +343,8 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin {"hsa_ven_amd_loader_1_01_pfn_t", sizeof(hsa_ven_amd_loader_1_01_pfn_t)}, {"hsa_ven_amd_loader_1_02_pfn_t", sizeof(hsa_ven_amd_loader_1_02_pfn_t)}, {"hsa_ven_amd_loader_1_03_pfn_t", sizeof(hsa_ven_amd_loader_1_03_pfn_t)}, - {"hsa_ven_amd_aqlprofile_1_00_pfn_t", sizeof(hsa_ven_amd_aqlprofile_1_00_pfn_t)}}; + {"hsa_ven_amd_aqlprofile_1_00_pfn_t", sizeof(hsa_ven_amd_aqlprofile_1_00_pfn_t)}, + {"hsa_ven_amd_pc_sampling_1_00_pfn_t", sizeof(hsa_ven_amd_pc_sampling_1_00_pfn_t)}}; static const size_t num_tables = sizeof(sizes) / sizeof(sizes_t); if (minor > 99) return 0; @@ -372,6 +373,9 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin 
case HSA_EXTENSION_AMD_AQLPROFILE: name = "hsa_ven_amd_aqlprofile_"; break; + case HSA_EXTENSION_AMD_PC_SAMPLING: + name = "hsa_ven_amd_pc_sampling_"; + break; default: return 0; } @@ -429,6 +433,21 @@ hsa_status_t hsa_system_get_major_extension_table(uint16_t extension, uint16_t v return HSA_STATUS_SUCCESS; } + if (extension == HSA_EXTENSION_AMD_PC_SAMPLING) { + if (version_major != core::Runtime::runtime_singleton_->extensions_.pcs_api.version.major_id) { + return HSA_STATUS_ERROR; + } + hsa_ven_amd_pc_sampling_1_00_pfn_t ext_table; + ext_table.hsa_ven_amd_pcs_create = hsa_ven_amd_pcs_create; + ext_table.hsa_ven_amd_pcs_create_from_id = hsa_ven_amd_pcs_create_from_id; + ext_table.hsa_ven_amd_pcs_destroy = hsa_ven_amd_pcs_destroy; + ext_table.hsa_ven_amd_pcs_start = hsa_ven_amd_pcs_start; + ext_table.hsa_ven_amd_pcs_stop = hsa_ven_amd_pcs_stop; + ext_table.hsa_ven_amd_pcs_flush = hsa_ven_amd_pcs_flush; + + memcpy(table, &ext_table, Min(sizeof(ext_table), table_length)); + } + if (extension == HSA_EXTENSION_FINALIZER) { if (version_major != core::Runtime::runtime_singleton_->extensions_.finalizer_api.version.major_id) { @@ -2195,6 +2214,7 @@ hsa_status_t hsa_executable_create_alt( IS_BAD_PTR(executable); Executable *exec = GetLoader()->CreateExecutable( + std::unique_ptr(new amd::LoaderContext()), profile, options, default_float_rounding_mode); CHECK_ALLOC(exec); diff --git a/src/core/runtime/hsa_api_trace.cpp b/src/core/runtime/hsa_api_trace.cpp index 0c3ba59a7..48dee4858 100644 --- a/src/core/runtime/hsa_api_trace.cpp +++ b/src/core/runtime/hsa_api_trace.cpp @@ -80,10 +80,11 @@ void HsaApiTable::Init() { // they can add preprocessor macros on the new functions constexpr size_t expected_core_api_table_size = 1016; - constexpr size_t expected_amd_ext_table_size = 560; + constexpr size_t expected_amd_ext_table_size = 576; constexpr size_t expected_image_ext_table_size = 120; constexpr size_t expected_finalizer_ext_table_size = 64; constexpr size_t 
expected_tools_table_size = 64; + constexpr size_t expected_pc_sampling_ext_table_size = 72; static_assert(sizeof(CoreApiTable) == expected_core_api_table_size, "HSA core API table size changed, bump HSA_CORE_API_TABLE_STEP_VERSION and set " @@ -101,6 +102,9 @@ void HsaApiTable::Init() { static_assert(sizeof(ToolsApiTable) == expected_tools_table_size, "HSA tools table size changed, bump HSA_TOOLS_API_TABLE_STEP_VERSION " "and set expected_tools_table_size to the new size of the struct"); + static_assert(sizeof(PcSamplingExtTable) == expected_pc_sampling_ext_table_size, + "HSA PC Sampling ext table size changed, bump HSA_PC_SAMPLING_API_TABLE_STEP_VERSION " + "and set expected_pc_sampling_ext_table_size to the new size of the struct"); // Initialize Version of Api Table hsa_api.version.major_id = HSA_API_TABLE_MAJOR_VERSION; @@ -120,6 +124,7 @@ void HsaApiTable::Init() { // of Hsa Runtime initialization, including their major ids hsa_api.finalizer_ext_ = NULL; hsa_api.image_ext_ = NULL; + hsa_api.pc_sampling_ext_ = NULL; UpdateTools(); hsa_api.tools_ = &tools_api; @@ -146,6 +151,13 @@ void HsaApiTable::CloneExts(void* ext_table, uint32_t table_id) { hsa_api.image_ext_ = &image_api; return; } + + // Update HSA Extension PC Sampling Api table + if (table_id == HSA_EXT_PC_SAMPLING_API_TABLE_ID) { + pcs_api = *reinterpret_cast(ext_table); + hsa_api.pc_sampling_ext_ = &pcs_api; + return; + } } void HsaApiTable::LinkExts(void* ext_table, uint32_t table_id) { @@ -165,6 +177,13 @@ void HsaApiTable::LinkExts(void* ext_table, uint32_t table_id) { hsa_api.image_ext_ = reinterpret_cast(ext_table); return; } + + // Update HSA Extension PC Sampling Api table + if (table_id == HSA_EXT_PC_SAMPLING_API_TABLE_ID) { + pcs_api = *reinterpret_cast(ext_table); + hsa_api.pc_sampling_ext_ = &pcs_api; + return; + } } // Update Api table for Hsa Core Runtime @@ -432,6 +451,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_portable_export_dmabuf_fn =
AMD::hsa_amd_portable_export_dmabuf; amd_ext_api.hsa_amd_portable_close_dmabuf_fn = AMD::hsa_amd_portable_close_dmabuf; amd_ext_api.hsa_amd_vmem_address_reserve_fn = AMD::hsa_amd_vmem_address_reserve; + amd_ext_api.hsa_amd_vmem_address_reserve_align_fn = AMD::hsa_amd_vmem_address_reserve_align; amd_ext_api.hsa_amd_vmem_address_free_fn = AMD::hsa_amd_vmem_address_free; amd_ext_api.hsa_amd_vmem_handle_create_fn = AMD::hsa_amd_vmem_handle_create; amd_ext_api.hsa_amd_vmem_handle_release_fn = AMD::hsa_amd_vmem_handle_release; @@ -445,6 +465,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_vmem_get_alloc_properties_from_handle_fn = AMD::hsa_amd_vmem_get_alloc_properties_from_handle; amd_ext_api.hsa_amd_agent_set_async_scratch_limit_fn = AMD::hsa_amd_agent_set_async_scratch_limit; + amd_ext_api.hsa_amd_queue_get_info_fn = AMD::hsa_amd_queue_get_info; } void HsaApiTable::UpdateTools() { diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index 7b5689852..ce8d9256c 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -761,7 +761,7 @@ hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, siz TRY; IS_OPEN(); - if (size == 0 || ptr == NULL || (flags > HSA_AMD_MEMORY_POOL_PCIE_FLAG)) { + if (size == 0 || ptr == NULL) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -774,7 +774,11 @@ hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, siz MemoryRegion::AllocateFlags alloc_flag = core::MemoryRegion::AllocateRestrict; - if (flags == HSA_AMD_MEMORY_POOL_PCIE_FLAG) alloc_flag |= core::MemoryRegion::AllocatePCIeRW; + if (flags & HSA_AMD_MEMORY_POOL_PCIE_FLAG) + alloc_flag |= core::MemoryRegion::AllocatePCIeRW; + + if (flags & HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG) + alloc_flag |= core::MemoryRegion::AllocateContiguous; #ifdef SANITIZER_AMDGPU alloc_flag |= core::MemoryRegion::AllocateAsan; @@ -1072,10 +1076,13 @@ hsa_status_t 
hsa_amd_queue_set_priority(hsa_queue_t* queue, core::Queue* cmd_queue = core::Queue::Convert(queue); IS_VALID(cmd_queue); + // Highest queue priority allowed for HSA user is HSA_QUEUE_PRIORITY_HIGH + // HSA_QUEUE_PRIORITY_MAXIMUM is reserved for PC Sampling and can only be allocated internally + // in ROCR static std::map ext_kmt_priomap = { {HSA_AMD_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_MINIMUM}, {HSA_AMD_QUEUE_PRIORITY_NORMAL, HSA_QUEUE_PRIORITY_NORMAL}, - {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_MAXIMUM}, + {HSA_AMD_QUEUE_PRIORITY_HIGH, HSA_QUEUE_PRIORITY_HIGH}, }; auto priority_it = ext_kmt_priomap.find(priority); @@ -1224,10 +1231,21 @@ hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t addre IS_OPEN(); IS_ZERO(size); IS_TRUE(core::Runtime::runtime_singleton_->VirtualMemApiSupported()); - return core::Runtime::runtime_singleton_->VMemoryAddressReserve(va, size, address, flags); + return core::Runtime::runtime_singleton_->VMemoryAddressReserve(va, size, address, 0, flags); + CATCH; +} + +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags) { + TRY; + IS_OPEN(); + IS_ZERO(size); + IS_TRUE(core::Runtime::runtime_singleton_->VirtualMemApiSupported()); + return core::Runtime::runtime_singleton_->VMemoryAddressReserve(va, size, address, alignment, flags); CATCH; } + hsa_status_t hsa_amd_vmem_address_free(void* va, size_t size) { TRY; IS_OPEN(); @@ -1385,5 +1403,17 @@ hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t _agent, s CATCH; } +hsa_status_t HSA_API hsa_amd_queue_get_info(hsa_queue_t* _queue, + hsa_queue_info_attribute_t attribute, void* value) { + TRY; + IS_OPEN(); + + core::Queue* queue = core::Queue::Convert(_queue); + IS_VALID(queue); + + return queue->GetInfo(attribute, value); + CATCH; +} + } // namespace amd } // namespace rocr diff --git a/src/core/runtime/hsa_ext_interface.cpp b/src/core/runtime/hsa_ext_interface.cpp index 
2931b2b54..d872e485b 100644 --- a/src/core/runtime/hsa_ext_interface.cpp +++ b/src/core/runtime/hsa_ext_interface.cpp @@ -41,6 +41,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "image/inc/hsa_ext_image_impl.h" +#include "pcs/inc/hsa_ven_amd_pc_sampling_impl.h" #include "core/inc/hsa_ext_interface.h" #include "core/inc/runtime.h" @@ -56,6 +57,7 @@ namespace core { ExtensionEntryPoints::ExtensionEntryPoints() { InitFinalizerExtTable(); InitImageExtTable(); + InitPcSamplingExtTable(); InitAmdExtTable(); } @@ -99,6 +101,22 @@ void ExtensionEntryPoints::InitImageExtTable() { image_api.hsa_ext_image_create_with_layout_fn = hsa_ext_null; } +// Initialize PC Sampling function table to be NULLs +void ExtensionEntryPoints::InitPcSamplingExtTable() { + // Initialize Version of Api Table + pcs_api.version.major_id = 0x00; + pcs_api.version.minor_id = 0x00; + pcs_api.version.step_id = 0x00; + + pcs_api.hsa_ven_amd_pcs_iterate_configuration_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_create_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_create_from_id_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_destroy_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_start_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_stop_fn = hsa_ext_null; + pcs_api.hsa_ven_amd_pcs_flush_fn = hsa_ext_null; +} + // Initialize Amd Ext table for Api related to Images void ExtensionEntryPoints::InitAmdExtTable() { hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = hsa_ext_null; @@ -131,6 +149,9 @@ void ExtensionEntryPoints::UnloadImage() { void ExtensionEntryPoints::Unload() { // Reset Image apis to hsa_ext_null function UnloadImage(); +#ifdef HSA_PC_SAMPLING_SUPPORT + rocr::pcs::ReleasePcSamplingRsrcs(); +#endif for (auto lib : libs_) { void* ptr = os::GetExportAddress(lib, "Unload"); @@ -148,6 +169,7 @@ void ExtensionEntryPoints::Unload() { libs_.clear(); InitFinalizerExtTable(); + InitPcSamplingExtTable(); InitImageExtTable(); InitAmdExtTable(); 
core::hsa_internal_api_table_.Reset(); @@ -180,6 +202,23 @@ bool ExtensionEntryPoints::LoadImage() { return true; } +void ExtensionEntryPoints::LoadPcSampling() { +#ifdef HSA_PC_SAMPLING_SUPPORT + if (core::Runtime::runtime_singleton_->flag().disable_pc_sampling()) return; + + // Bind to PC Sampling implementation api's + rocr::pcs::LoadPcSampling(&pcs_api); + + // Initialize Version of Api Table + pcs_api.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pcs_api.version.minor_id = sizeof(PcSamplingExtTable); + pcs_api.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + + // Update private copy of Api table with handle for PC Sampling extensions + hsa_internal_api_table_.CloneExts(&pcs_api, core::HsaApiTable::HSA_EXT_PC_SAMPLING_API_TABLE_ID); +#endif +} + bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { os::LibHandle lib = os::LoadLib(library_name); if (lib == NULL) { @@ -429,6 +468,54 @@ hsa_status_t hsa_ext_image_create_with_layout( image); } +hsa_status_t HSA_API hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api + .hsa_ven_amd_pcs_iterate_configuration_fn(agent, configuration_callback, callback_data); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_create( + hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, + size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_create_fn( + agent, method, units, interval, latency, buffer_size, data_ready_callback, + client_callback_data, pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, +
hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api + .hsa_ven_amd_pcs_create_from_id_fn(pcs_id, agent, method, units, interval, latency, + buffer_size, data_ready_callback, client_callback_data, + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_destroy_fn( + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_start_fn( + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_stop_fn( + pc_sampling); +} + +hsa_status_t HSA_API hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling) { + return rocr::core::Runtime::runtime_singleton_->extensions_.pcs_api.hsa_ven_amd_pcs_flush_fn( + pc_sampling); +} + //---------------------------------------------------------------------------// // Stubs for internal extension functions //---------------------------------------------------------------------------// diff --git a/src/core/runtime/intercept_queue.cpp b/src/core/runtime/intercept_queue.cpp index 7f82965ee..47598bb95 100644 --- a/src/core/runtime/intercept_queue.cpp +++ b/src/core/runtime/intercept_queue.cpp @@ -41,6 +41,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "core/inc/intercept_queue.h" +#include "core/inc/amd_aql_queue.h" #include "core/util/utils.h" #include "inc/hsa_api_trace.h" @@ -386,5 +387,18 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) { atomic::Store(&amd_queue_.read_dispatch_id, 
next_packet_, std::memory_order_release); } +hsa_status_t InterceptQueue::GetInfo(hsa_queue_info_attribute_t attribute, void* value) { + switch (attribute) { + case HSA_AMD_QUEUE_INFO_AGENT: + case HSA_AMD_QUEUE_INFO_DOORBELL_ID: { + if (!AMD::AqlQueue::IsType(wrapped.get())) return HSA_STATUS_ERROR_INVALID_QUEUE; + + AMD::AqlQueue* aqlQueue = static_cast(wrapped.get()); + return aqlQueue->GetInfo(attribute, value); + } + } + return HSA_STATUS_ERROR_INVALID_ARGUMENT; +} + } // namespace core } // namespace rocr diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index c87cbc71f..e3d02d3c3 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -349,6 +349,8 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx1103", 11, 0, 3, unsupported, unsupported, 32) ISAREG_ENTRY_GEN("gfx1150", 11, 5, 0, unsupported, unsupported, 32) ISAREG_ENTRY_GEN("gfx1151", 11, 5, 1, unsupported, unsupported, 32) + ISAREG_ENTRY_GEN("gfx1200", 12, 0, 0, unsupported, unsupported, 32) + ISAREG_ENTRY_GEN("gfx1201", 12, 0, 1, unsupported, unsupported, 32) #undef ISAREG_ENTRY_GEN return supported_isas; } diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index ceae7333e..9ccef65cc 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -73,6 +73,7 @@ #include "core/util/os.h" #include "core/inc/exceptions.h" #include "inc/hsa_ven_amd_aqlprofile.h" +#include "core/inc/amd_core_dump.hpp" #ifndef HSA_VERSION_MAJOR #define HSA_VERSION_MAJOR 1 @@ -1077,7 +1078,8 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) { int connection_fd; char buf[IPC_SOCK_SERVER_DMABUF_FD_HANDLE_LENGTH]; - std::map openDmaBufs; + // openDmaBufs pair is + std::map> openDmaBufs; // Wait until the client has connected while (1) { connection_fd = accept(ipc_sock_server_fd_, NULL, NULL); @@ -1095,9 +1097,31 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) { void *baseAddr = NULL; size_t memLen = 0; - ScopedAcquire lock(&ipc_sock_server_lock_); 
bool isClose = !!(IPC_SOCK_SERVER_CONN_CLOSE_BIT & conn_handle); + bool isAlreadyOpen = false; conn_handle &= ~(IPC_SOCK_SERVER_CONN_CLOSE_BIT); + + // send dmabufs that are already opened + for (auto&conns : openDmaBufs) { + if (conn_handle == conns.first) { + if (!isClose) { + SendDmaBufFd(connection_fd, openDmaBufs[conn_handle].first); + openDmaBufs[conn_handle].second++; + } else { + openDmaBufs[conn_handle].second--; + if (!openDmaBufs[conn_handle].second) { + close(openDmaBufs[conn_handle].first); + openDmaBufs.erase(conn_handle); + } + } + isAlreadyOpen = true; + break; + } + } + + if (isAlreadyOpen) continue; + + ScopedAcquire lock(&ipc_sock_server_lock_); for (auto& conns : ipc_sock_server_conns_) { if (conn_handle == conns.first) { baseAddr = conns.second.first; @@ -1105,20 +1129,16 @@ void Runtime::AsyncIPCSockServerConnLoop(void*) { break; } } - if (!isClose) { - // we can ignore a bad export since importer will catch the bad fd - hsaKmtExportDMABufHandle(baseAddr, memLen, &dmabuf_fd, &fragOffset); - SendDmaBufFd(connection_fd, dmabuf_fd); - openDmaBufs[conn_handle] = dmabuf_fd; - } else { - close(openDmaBufs[conn_handle]); - openDmaBufs.erase(conn_handle); - } + + HSAKMT_STATUS err = hsaKmtExportDMABufHandle(baseAddr, memLen, &dmabuf_fd, &fragOffset); + if (err != HSAKMT_STATUS_SUCCESS) continue; + SendDmaBufFd(connection_fd, dmabuf_fd); + openDmaBufs[conn_handle] = std::make_pair(dmabuf_fd, 1); } // Clean up for (auto& conns : openDmaBufs) - close(conns.second); // close all dangling open dmabuf FDs + close(conns.second.first); // close all dangling open dmabuf FDs ipc_sock_server_conns_.clear(); close(ipc_sock_server_fd_); } @@ -1186,6 +1206,17 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han // System sub allocations are not supported for now. if (handle->handle[3] && useFrag) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + // Work around to defer export on import call to minimize FD creation. 
+ // Without this, a deferred export may fail due to the kernel mode driver not + // holding the GEM object reference. + // Export the dmabuf then close the file to get the reference to ensure the + // deferred export will not run into this problem. + int dmabuf_fd; + uint64_t fragOffset; + HSAKMT_STATUS err = hsaKmtExportDMABufHandle(baseAddr, memLen, &dmabuf_fd, &fragOffset); + if (err != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR; + close(dmabuf_fd); + ScopedAcquire lock(&ipc_sock_server_lock_); if (!ipc_sock_server_conns_.size()) { // create new runtime socket server struct sockaddr_un address; @@ -1250,6 +1281,12 @@ static int GetIPCDmaBufFD(uint32_t conn_handle, uint64_t dmabuf_fd_handle, bool assert(socket_fd > -1 && "DMA buffer could not be imported for IPC!"); if (socket_fd == -1) return -1; + // Set 10 second timeout for ReceiveDmaBufFd + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + setsockopt(socket_fd, SOL_SOCKET, SO_RCVTIMEO, (const char*)&tv, sizeof tv); + char buf[IPC_SOCK_SERVER_DMABUF_FD_HANDLE_LENGTH]; memset(&address, 0, sizeof(struct sockaddr_un)); memset(buf, 0, sizeof(buf)); @@ -1678,6 +1715,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { hsa_status_t custom_handler_status = HSA_STATUS_ERROR; auto system_event_handlers = runtime_singleton_->GetSystemEventHandlers(); + Agent* faulty_agent = nullptr; // If custom handler is registered, pack the fault info and call the handler if (!system_event_handlers.empty()) { hsa_amd_event_t memory_fault_event; @@ -1687,7 +1725,7 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { // Find the faulty agent auto it = runtime_singleton_->agents_by_node_.find(fault.NodeId); assert(it != runtime_singleton_->agents_by_node_.end() && "Can't find faulty agent."); - Agent* faulty_agent = it->second.front(); + faulty_agent = it->second.front(); fault_info.agent = Agent::Convert(faulty_agent); fault_info.virtual_address = fault.VirtualAddress; @@ -1749,12 +1787,12 
@@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { reason += "Unknown"; } - core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; + faulty_agent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; fprintf( stderr, "Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. Reason: %s.\n", - fault.NodeId, reinterpret_cast(faultingAgent->public_handle().handle), + fault.NodeId, reinterpret_cast(faulty_agent->public_handle().handle), reinterpret_cast(fault.VirtualAddress), (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str()); @@ -1762,6 +1800,16 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { PrintMemoryMapNear(reinterpret_cast(fault.VirtualAddress)); #endif } + // Fallback if KFD does not support GPU core dump. In this case, the core dump is + generated by hsa-runtime. + if (faulty_agent && faulty_agent->isa()->GetMajorVersion() != 11 && + !runtime_singleton_->KfdVersion().supports_core_dump) { + + if (pcs::PcsRuntime::instance()->SessionsActive()) + fprintf(stderr, "GPU core dump skipped because PC Sampling active\n"); + else if (amd::coredump::dump_gpu_core()) + fprintf(stderr, "GPU core dump failed\n"); + } assert(false && "GPU memory access fault."); std::abort(); } @@ -1953,6 +2001,11 @@ void Runtime::LoadExtensions() { extensions_.LoadImage(); hsa_api_table_.LinkExts(&extensions_.image_api, core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID); + + // Update Hsa Api Table with handle of PCS extension Apis + extensions_.LoadPcSampling(); + hsa_api_table_.LinkExts(&extensions_.pcs_api, + core::HsaApiTable::HSA_EXT_PC_SAMPLING_API_TABLE_ID); } void Runtime::UnloadExtensions() { extensions_.Unload(); } @@ -2909,18 +2962,23 @@ hsa_status_t Runtime::DmaBufClose(int dmabuf) { } hsa_status_t Runtime::VMemoryAddressReserve(void** va, size_t size, uint64_t address, - uint64_t flags) { + uint64_t alignment, uint64_t flags) { void* addr = (void*)address;
HsaMemFlags memFlags = {}; + + if (!alignment) + alignment = sysconf(_SC_PAGE_SIZE); + ScopedAcquire lock(&memory_lock_); memFlags.ui32.OnlyAddress = 1; memFlags.ui32.FixedAddress = 1; + /* Try reserving the VA requested by user */ - if (hsaKmtAllocMemory(0, size, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) { + if (hsaKmtAllocMemoryAlign(0, size, alignment, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) { memFlags.ui32.FixedAddress = 0; /* Could not reserved VA requested, allocate alternate VA */ - if (hsaKmtAllocMemory(0, size, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) + if (hsaKmtAllocMemoryAlign(0, size, alignment, memFlags, &addr) != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } diff --git a/src/core/runtime/signal.cpp b/src/core/runtime/signal.cpp index eee62f595..065f8ccc9 100644 --- a/src/core/runtime/signal.cpp +++ b/src/core/runtime/signal.cpp @@ -52,7 +52,7 @@ namespace rocr { namespace core { -HybridMutex Signal::ipcLock_; +KernelMutex Signal::ipcLock_; std::map Signal::ipcMap_; void SharedSignalPool_t::clear() { @@ -128,7 +128,7 @@ LocalSignal::LocalSignal(hsa_signal_value_t initial_value, bool exportable) } void Signal::registerIpc() { - ScopedAcquire lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); auto handle = Convert(this); assert(ipcMap_.find(handle.handle) == ipcMap_.end() && "Can't register the same IPC signal twice."); @@ -136,7 +136,7 @@ void Signal::registerIpc() { } bool Signal::deregisterIpc() { - ScopedAcquire lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); if (refcount_ != 0) return false; auto handle = Convert(this); const auto& it = ipcMap_.find(handle.handle); @@ -146,14 +146,14 @@ bool Signal::deregisterIpc() { } Signal* Signal::lookupIpc(hsa_signal_t signal) { - ScopedAcquire lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); const auto& it = ipcMap_.find(signal.handle); if (it == ipcMap_.end()) return nullptr; return it->second; } Signal* Signal::duplicateIpc(hsa_signal_t signal) { - ScopedAcquire
lock(&ipcLock_); + ScopedAcquire lock(&ipcLock_); const auto& it = ipcMap_.find(signal.handle); if (it == ipcMap_.end()) return nullptr; it->second->refcount_++; diff --git a/src/core/runtime/trap_handler/CMakeLists.txt b/src/core/runtime/trap_handler/CMakeLists.txt index 2196cb0e9..251572412 100644 --- a/src/core/runtime/trap_handler/CMakeLists.txt +++ b/src/core/runtime/trap_handler/CMakeLists.txt @@ -46,8 +46,9 @@ cmake_minimum_required ( VERSION 3.7 ) find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) -set (TARGET_DEVS "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1030;gfx1100") -set (POSTFIX "9;940;941;942;1010;10;11") +set (TARGET_DEVS "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1030;gfx1100;gfx1200") +set (POSTFIX "9;940;941;942;1010;10;11;12") +set (SOURCE_SUFFIX ";;;;;;;_gfx12") if(${CMAKE_VERBOSE_MAKEFILE}) get_property(clang_path TARGET clang PROPERTY LOCATION) @@ -92,11 +93,11 @@ endfunction(gen_kernel_bc) ##========================================== ## Find device code object name and forward to custom command ##========================================== -function(build_kernel TRAP_HANDLER_NAME TARGET_ID POSTFIX) +function(build_kernel TRAP_HANDLER_NAME TARGET_ID POSTFIX SOURCE_SUFFIX) ## generate trap handler object code files set (CODE_OBJECT_FILE "${TRAP_HANDLER_NAME}_${POSTFIX}") - set (TRAP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/trap_handler.s") + set (TRAP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/trap_handler${SOURCE_SUFFIX}.s") gen_kernel_bc(${TARGET_ID} ${TRAP_FILE} ${CODE_OBJECT_FILE}) ## Build a list of code object file names @@ -117,10 +118,11 @@ function(build_kernel_for_devices TRAP_HANDLER_NAME) foreach(ind RANGE ${dev_count}) list(GET TARGET_DEVS ${ind} dev) list(GET POSTFIX ${ind} post) + list(GET SOURCE_SUFFIX ${ind} suffix) if(${CMAKE_VERBOSE_MAKEFILE}) message("\n Generating: ${dev} ...") endif() - build_kernel(${TRAP_HANDLER_NAME} ${dev} 
${post}) + build_kernel(${TRAP_HANDLER_NAME} ${dev} ${post} "${suffix}") endforeach(ind) set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE) diff --git a/src/core/runtime/trap_handler/trap_handler.s b/src/core/runtime/trap_handler/trap_handler.s index 0936786c5..3933c926e 100644 --- a/src/core/runtime/trap_handler/trap_handler.s +++ b/src/core/runtime/trap_handler/trap_handler.s @@ -47,12 +47,14 @@ .set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 .set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) .set SQ_WAVE_STATUS_HALT_SHIFT , 13 +.set SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT , 18 .set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16)) .set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT , 8 .set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11 .set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28 .set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F .set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12 +.set SQ_WAVE_MODE_EXCP_EN_SIZE , 8 .set TRAP_ID_ABORT , 2 .set TRAP_ID_DEBUGTRAP , 3 .set DOORBELL_ID_SIZE , 10 @@ -64,6 +66,7 @@ .set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) .set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) +.set TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT , 31 .set TTMP6_WAVE_STOPPED_SHIFT , 30 .set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 .set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) @@ -90,21 +93,34 @@ .if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 .set TTMP11_TTMPS_SETUP_SHIFT , 31 + + // Bit to indicate that this is a hosttrap trap instead of stochastic trap + // Currently not used + .set TTMP13_PCS_IS_STOCHASTIC , 24 .endif // ABI between first and second level trap handler: -// ttmp0 = PC[31:0] +// ttmp0 = PC[31:0] +// ttmp8 = WorkgroupIdX +// ttmp9 = WorkgroupIdY +// ttmp10 = WorkgroupIdZ // ttmp12 = SQ_WAVE_STATUS // ttmp14 = TMA[31:0] // ttmp15 = TMA[63:32] // gfx9: // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] -// gfx906/gfx908/gfx90a: -// 
ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// all gfx9 (except gfx940, gfx941, gfx942): +// ttmp6 = 0[6:0], DispatchPktIndx[24:0] +// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveInWg[5:0] +// Note: Once stochastic sampling is implemented, L2 Trap Handler will use Bit 23 +// (TTMP11_PCS_IS_STOCHASTIC) to differentiate between stochastic and hosttrap // gfx940/gfx941/gfx942: +// ttmp11 = 0[0], DispatchPktIndx[24:0], WaveIdInWg[5:0] // ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0] // gfx10: // ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] +// gfx10/gfx11: +// ttmp6 = 0[6:0], DispatchPktIndx[24:0] // gfx1010: // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] // gfx1030/gfx1100: @@ -115,6 +131,31 @@ trap_entry: s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE s_cbranch_scc0 .no_skip_debugtrap +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) // PC_SAMPLING_GFX9 + // ttmp[14:15] is TMA2; Available: ttmp[2:3], ttmp[4:5], ttmp7, ttmp13 + // Check if this is a host-trap. 
For now, if so, that means we are sampling + // + // TMA2 layout: + // [0x00] out_buf_t* host_trap_buffers; + // [0x08] out_buf_t* stochastic_trap_buffers; + // + // --- Start profile trap handlers GFX9 --- // + // if (host_trap) { + // if (stochastic) // Not implemented yet + // ttmp11.bit23 = 1; // Not implemented yet + // profiling_trap_handler(tma->host_trap_buffers); + // } + + s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT + s_cbranch_scc0 .not_host_trap_gfx9 + s_load_dwordx2 ttmp[14:15], ttmp[14:15], 0 glc // ttmp[14:15]=&host_trap_buffers + // TODO: When implementing stochastic sampling, need to set TTMP11_PCS_IS_STOCHASTIC + // or TTMP13_PCS_IS_STOCHASTIC to differentiate between hosttrap and stochastic sampling + s_waitcnt lgkmcnt(0) + s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers + +.not_host_trap_gfx9: +.endif // PC_SAMPLING_GFX9 // If caused by s_trap then advance PC. s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT s_cbranch_scc1 .not_s_trap @@ -134,7 +175,253 @@ trap_entry: // Ignore llvm.debugtrap. s_branch .exit_trap +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) // PC_SAMPLING_GFX9 + // tma->host_trap_buffers Offsets: + // [0x00] uint64_t buf_write_val; + // [0x08] uint32_t buf_size; + // [0x0c] uint32_t reserved0; + // [0x10] uint32_t buf_written_val0; + // [0x14] uint32_t buf_watermark0; + // [0x18] hsa_signal_t done_sig0; + // [0x20] uint32_t buf_written_val1; + // [0x24] uint32_t buf_watermark1; + // [0x28] hsa_signal_t done_sig1; + // [0x30] uint8_t reserved1[16]; + // [0x40] sample_t buffer0[buf_size]; + // [0x40+(buf_size*sizeof(sample_t))]sample_t buffer1[buf_size]; + // + //__global__ void profiling_trap_handler(out_buf_t* tma) { + // uint64_t local_entry = atomicAdd(&tma->buf_write_val, 1); + // int buf_to_use = local_entry >> 63; + // local_entry &= (ULLONG_MAX >> 1); + // + // if (local_entry < tma->buf_size) { + // sample_t *buf_base = buf_to_use ? 
tma->buffer1 : tma->buffer0; + // fill_sample(&buf_base[local_entry]); // reads TTMP11 as well + // + // uint32_t * written = buf_to_use ? &(tma->buf_written_val1) : + // &(tma->buf_written_val0); + // + // uint64_t done = __atomic_fetch_add(written, 1, + // memory_order_release, memory_scope_system); + // + // uint32_t watermark = buf_to_use ? tma->buf_watermark1 : + // tma->buf_watermark0; + // if (done == watermark) { + // hsa_signal_t done_sig = buf_to_use ? tma->done_sig1 : + // tma->done_sig0; + // send_signal(done_sig); + // } + // } + //} + + // ttmp[14:15] is tma->host_trap_buffers; Available: ttmp[2:3], ttmp[4:5], ttmp7, ttmp13 +.profile_trap_handlers_gfx9: + s_mov_b64 ttmp[2:3], 1 // atomic increment buf_write_val + s_atomic_add_x2 ttmp[2:3], ttmp[14:15], glc // ttmp[2:3] = packed local_entry + s_load_dword ttmp13, ttmp[14:15], 0x8 // ttmp13 = tma->buf_size + s_waitcnt lgkmcnt(0) + s_lshr_b32 ttmp7, ttmp3, 31 // ttmp7 = buf_to_use + s_bitset0_b32 ttmp6, 31 // clear out ttmp6 bit31 + s_cmp_eq_u32 ttmp7, 0 // store off buf_to_use ... + s_cbranch_scc1 .skip_ttmp6_set_gfx9 // into bit31 of ttmp6 + s_bitset1_b32 ttmp6, 31 +.skip_ttmp6_set_gfx9: + s_bfe_u64 ttmp[2:3], ttmp[2:3], (63<<16) // ttmp[2:3] = new local_entry + s_cmp_lg_u32 ttmp3, 0 // if entry >= 2^32, always lost + s_cbranch_scc1 .pc_sampling_exit + s_cmp_ge_u32 ttmp2, ttmp13 // if local_entry >= buf_size + s_cbranch_scc1 .pc_sampling_exit + + // ttmp2=local_entry, ttmp7=buf_to_use (also in bit31 of ttmp6), ttmp13=buf_size + // ttmp[14:15] is tma->host_trap_buffers. Available: ttmp3, ttmp[4:5] + s_mul_i32 ttmp13, ttmp13, ttmp7 // ttmp[4:5]=buf_size if ... + s_mul_i32 ttmp4, ttmp13, 0x40 // buf_to_use=1, 0 otherwise + s_mul_hi_u32 ttmp5, ttmp13, 0x40 + + s_add_u32 ttmp4, ttmp4, 0x40 // now ttmp[4:5]=offset from ... + s_addc_u32 ttmp5, ttmp5, 0 // tma to start of target buffer; + s_add_u32 ttmp4, ttmp14, ttmp4 // ttmp[4:5] now points to ...
+ s_addc_u32 ttmp5, ttmp15, ttmp5 // buffer0 or buffer1 + s_mov_b32 ttmp7, ttmp2 + + // ttmp7 contains local_entry, ttmp[4:5] contains "&bufferX", + // ttmp[14:15] holds 'tma->host_trap_buffers' pointer + // ttmp[2:3] and ttmp13 are available for gathering perf sample info + // ttmp[14:15] is live out + + // fill_sample(...) - begin // + // typedef struct { + // [0x00] uint64_t pc; + // [0x08] uint64_t exec_mask; + // [0x10] uint32_t workgroup_id_x; + // [0x14] uint32_t workgroup_id_y; + // [0x18] uint32_t workgroup_id_z; + // [0x1c] uint32_t wave_in_wg : 6; + // uint32_t chiplet : 3; // Currently not used + // uint32_t reserved : 23; + // [0x20] uint32_t hw_id; + // [0x24] uint32_t reserved0; + // [0x28] uint64_t reserved1; + // [0x30] uint64_t timestamp; + // [0x38] uint64_t correlation_id; + // } perf_sample_hosttrap_v1_t; + // + // __device__ void fill_sample_hosttrap_v1(perf_sample_hosttrap_v1_t* buf) { + // buf->pc = ((ttmp1 & 0xffff) << 32) | ttmp0; + // buf->exec_mask = EXEC; + // buf->workgroup_id_x = ttmp8; + // buf->workgroup_id_y = ttmp9; + // buf->workgroup_id_z = ttmp10; + // buf->chiplet_and_wave_id = ttmp11 & 0x3f; + // buf->hw_id = s_getreg_b32(HW_REG_HW_ID); + // buf->timestamp = s_memrealtime; + // buf->correlation_id = get_correlation_id(); + // } + + s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects + s_mul_hi_u32 ttmp3, ttmp7, 0x40 // ttmp[2:3] will contain byte ... 
+ s_add_u32 ttmp2, ttmp2, ttmp4 + s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry] + s_memrealtime ttmp[4:5] + s_and_b32 ttmp1, ttmp1, 0xffff // clear out extra data from PC_HI + s_store_dwordx2 ttmp[0:1], ttmp[2:3] // store PC + s_waitcnt lgkmcnt(0) // wait for timestamp + s_mov_b32 ttmp13, exec_lo + s_store_dword ttmp13, ttmp[2:3], 0x8 // store EXEC_LO + s_mov_b32 ttmp13, exec_hi + s_store_dword ttmp13, ttmp[2:3], 0xc // store EXEC_HI + s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y + s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x30 // store timestamp + s_and_b32 ttmp4, ttmp11, 0x3f + s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg + + // Get HW_ID using S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and HW_ID = 4 (0x4) + s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) + s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID + + // ttmp[2:3] = &buffer[local_entry]; ttmp[4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = tma->host_trap_buffers and is live out; ttmp6.b31 is buf_to_use, 0 or 1 + + // get_correlation_id() -- begin // + // Returns a value to use as a correlation ID. + // Returns a 64bit number made up of the 9-bit queue ID and the + // 25-bit dispatch_pkt concatenated together as: + // Upper 32 bits: {23 0s}{9b queue_id} + // Lower 32 bits: { 7 0s}{25b dispatch_pkt} + // __device__ uint64_t get_correlation_id() { + // uint64_t output; + // // Get bottom 10 bits of queue's doorbell, in doorbell region. + // // Doorbell is 8B (3b per); region is 8K (13b total) so 10 bits. 
+ // output = s_sendmsg(MSG_GET_DOORBELL); + // output &= 0x3ff; + // output <<= 32; + // // TTMP6 contains this packet dispatch ID modulus the queue size + // output |= TTMP6; + // return output; + // } + + // ttmp[2:3] = &buffer[local_entry] + // ttmp[4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = tma->host_trap_buffers and is live out + // ttmp6.b31 is buf_to_use, 0 or 1 and is live out + s_mov_b64 ttmp[4:5], exec // back up EXEC mask + s_mov_b32 exec_lo, 0x80000000 // prepare EXEC for doorbell spin + s_sendmsg sendmsg(MSG_GET_DOORBELL) // message 10, puts doorbell in EXEC +.wait_for_doorbell: + s_nop 0x7 // wait a bit for message to return + s_bitcmp0_b32 exec_lo, 0x1f // returned message will 0 bit 31 + s_cbranch_scc0 .wait_for_doorbell // wait some more if no data yet + s_mov_b32 exec_hi, ttmp5 // do not care about message[63:32] + s_and_b32 ttmp5, exec_lo, DOORBELL_ID_MASK // doorbell now in ttmp5 + s_mov_b32 exec_lo, ttmp4 // exec mask restored + s_and_b32 ttmp4, ttmp6, 0x1ffffff // extract low 25 bits from ttmp6 (DispatchPktIndx[24:0]) + // ttmp[4:5] is correlation ID + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x38 // store correlation_id to sample + // get_correlation_id() -- end // + + // complete stores before returning + s_dcache_wb + s_waitcnt lgkmcnt(0) + // fill_sample(...) - end // + + // ttmp[2:3], ttmp[4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = tma->host_trap_buffers; ttmp6.b31 is buf_to_use, 0 or 1 + s_lshr_b32 ttmp13, ttmp6, 31 // ttmp13 is buf_to_use + s_mulk_i32 ttmp13, 0x10 + // written_val0 to written_val_X + s_add_u32 ttmp14, ttmp14, ttmp13 // now ttmp[14:15] points to ... 
+ s_addc_u32 ttmp15, ttmp15, 0x0 // buf_written_valX-0x10 + s_mov_b32 ttmp7, 1 // atomic increment buf_written_valX + s_atomic_add ttmp7, ttmp[14:15], 0x10 glc // ttmp7 will contain 'done' + s_load_dword ttmp13, ttmp[14:15], 0x14 // ttmp13 will hold watermark + s_waitcnt lgkmcnt(0) + s_cmp_lg_u32 ttmp7, ttmp13 // if 'done' not at watermark, exit + s_cbranch_scc1 .pc_sampling_exit + + // ttmp[2:3], [4:5], ttmp7, and ttmp13 are free + // ttmp[14:15] = buf_written_valX-0x10 + + // send_signal(...) - begin // + //__device__ void send_signal(hsa_signal_t* signal) { + // + // amd_signal_t *sig = (amd_signal_t *)signal->handle; + // __atomic_store(&(sig->value), 0, memory_order_relaxed, memory_scope_system); + // if (sig->event_mailbox_ptr != NULL && sig->event_id != NULL) { + // uint32_t id = sig->event_id; + // __atomic_store(sig->event_mailbox_ptr, id, + // memory_order_relaxed, memory_scope_system); + // __builtin_amdgcn_s_sendmsg(1, id); + // } + //} + // We jump to the trap handler exit after this, so no live-out registers except + // those that must survive the trap handler + + s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x18 // load done_sig into ttmp[2:3] + s_waitcnt lgkmcnt(0) // it's actually an amd_signal_t* + s_load_dwordx2 ttmp[4:5], ttmp[2:3], 0x10 // load event mailbox ptr into 4:5 + s_load_dword ttmp7, ttmp[2:3], 0x18 // load event_id into ttmp7 + s_mov_b64 ttmp[14:15], 0 + s_store_dwordx2 ttmp[14:15], ttmp[2:3], 0x8 glc // zero out signal value + s_waitcnt lgkmcnt(0) // wait for value store to complete + s_cmp_eq_u64 ttmp[4:5], 0 + s_cbranch_scc1 .pc_sampling_exit // null mailbox means no interrupt + s_cmp_eq_u32 ttmp7, 0 + s_cbranch_scc1 .pc_sampling_exit // event_id zero means no interrupt + s_store_dword ttmp7, ttmp[4:5] glc // send event ID to the mailbox + s_waitcnt lgkmcnt(0) + s_mov_b32 ttmp13, m0 // save off m0 + s_mov_b32 m0, ttmp7 // put ID into message payload + s_nop 0x0 // Manually inserted wait states + s_sendmsg sendmsg(MSG_INTERRUPT) // send 
interrupt message + s_waitcnt lgkmcnt(0) // wait for message to be sent + s_mov_b32 m0, ttmp13 // restore m0 + // send_signal(...) - end // +.pc_sampling_exit: + // We can receive regular exceptions while doing PC-Sampling so we need to make sure we + // handle these exceptions here + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN + // Set bits corresponding to TRAPSTS.MEM_VIOL, TRAPSTS.ILLEGAL_INST and TRAPSTS.XNACK_ERROR + s_or_b32 ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT) + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, ttmp2, ttmp3 + // SCC will be 1 if either a maskable instruction was set, or one of MEM_VIOL, ILL_INST, XNACK_ERROR + s_cbranch_scc1 .no_skip_debugtrap // if any of those are set, handle exceptions + + // Check for maskable exceptions + s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) + s_and_b32 ttmp3, ttmp2, ttmp3 + s_cbranch_scc1 .no_skip_debugtrap + + // Since we are in PC sampling, it is safe to ignore watch1/2/3 and single step + // as those should only be enabled by the debugger. + // We could add them for completeness, i.e. check MODE.DEBUG_EN (bit 11) + // and "MODE.EXCP_EN.WATCH (bit 19) && (TRAPSTS.EXCP_HI.ADDR_WATCH1 (bit 12) || TRAPSTS.EXCP_HI.ADDR_WATCH2 (bit 13) || TRAPSTS.EXCP_HI.ADDR_WATCH3 (bit 14)). + s_branch .exit_trap +.endif // PC_SAMPLING_GFX9 .no_skip_debugtrap: // Save trap id and halt status in ttmp6. s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) @@ -243,12 +530,16 @@ trap_entry: // Halt the wavefront upon restoring STATUS below. 
s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT + // Set WAVE.SKIP_EXPORT as a marker so the debugger knows the trap handler was + // entered and has decided to halt the wave. + s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT s_cbranch_scc1 .ttmps_initialized s_mov_b32 ttmp4, 0 s_mov_b32 ttmp5, 0 + s_bitset0_b32 ttmp6, TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT .ttmps_initialized: .endif @@ -273,8 +564,8 @@ trap_entry: .endif // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_and_b64 exec, exec, exec // restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // restore STATUS.VCCZ, not writable by s_setreg_b32 s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 // Return to original (possibly modified) PC. diff --git a/src/core/runtime/trap_handler/trap_handler_gfx12.s b/src/core/runtime/trap_handler/trap_handler_gfx12.s new file mode 100644 index 000000000..2289c57d7 --- /dev/null +++ b/src/core/runtime/trap_handler/trap_handler_gfx12.s @@ -0,0 +1,226 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +/// Trap Handler V2 source +.set DOORBELL_ID_SIZE , 10 +.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) +.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) +.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) +.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) +.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) +.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) +.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) +.set SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT , 4 +.set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT , 7 +.set SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT , 6 +.set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT , 8 +.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT , 0 +.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE , 6 +.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_SHIFT , 0 +.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_SIZE , 6 +.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF +.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) +.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 28 +.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 4 +.set SQ_WAVE_STATE_PRIV_HALT_BFE , (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16)) +.set SQ_WAVE_STATE_PRIV_HALT_SHIFT , 14 +.set SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT , 2 +.set TRAP_ID_ABORT , 2 +.set TRAP_ID_DEBUGTRAP , 3 +.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) +.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 +.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) +.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) +.set TTMP6_SAVED_TRAP_ID_SHIFT , 25 +.set TTMP6_SAVED_TRAP_ID_SIZE , 4 +.set TTMP6_WAVE_STOPPED_SHIFT , 30 +.set TTMP8_DEBUG_FLAG_SHIFT , 31 +.set TTMP11_DEBUG_ENABLED_SHIFT , 23 +.set TTMP_PC_HI_SHIFT , 7 + +// ABI between first and second level trap 
handler: +// { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0] +// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0] +// ttmp12 = SQ_WAVE_STATE_PRIV +// ttmp14 = TMA[31:0] +// ttmp15 = TMA[63:32] + +trap_entry: + // Branch if not a trap (an exception instead). + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE + s_cbranch_scc0 .no_skip_debugtrap + + // If caused by s_trap then advance PC. + s_add_u32 ttmp0, ttmp0, 0x4 + s_addc_u32 ttmp1, ttmp1, 0x0 + +.not_s_trap: + // If llvm.debugtrap and debugger is not attached. + s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP + s_cbranch_scc0 .no_skip_debugtrap + + s_bitcmp0_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT + s_cbranch_scc0 .no_skip_debugtrap + + // Ignore llvm.debugtrap. + s_branch .exit_trap + +.no_skip_debugtrap: + // Save trap id and halt status in ttmp6. + s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) + s_min_u32 ttmp2, ttmp2, 0xF + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 + s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 + + // Fetch doorbell id for our queue. + s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) + s_wait_kmcnt 0 + s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK + + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) + + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT + s_cbranch_scc0 .not_memory_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 + + // Aperture violation requires XNACK_ERROR == 0. 
+ s_branch .not_aperture_violation + +.not_memory_violation: + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT + s_cbranch_scc0 .not_aperture_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 + +.not_aperture_violation: + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT + s_cbranch_scc0 .not_illegal_instruction + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 + +.not_illegal_instruction: + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE) + s_cbranch_scc0 .not_math_exception + s_getreg_b32 ttmp10, hwreg(HW_REG_TRAP_CTRL, SQ_WAVE_TRAP_CTRL_MATH_EXCP_SHIFT, SQ_WAVE_TRAP_CTRL_MATH_EXCP_SIZE) + s_and_b32 ttmp2, ttmp2, ttmp10 + + s_cbranch_scc0 .not_math_exception + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 + +.not_math_exception: + s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE + s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT + s_cbranch_scc0 .not_abort_trap + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 + +.not_abort_trap: + // If no other exception was flagged then report a generic error. + s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK + s_cbranch_scc1 .send_interrupt + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + +.send_interrupt: + // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id + s_mov_b32 ttmp2, m0 + s_mov_b32 m0, ttmp3 + s_nop 0x0 // Manually inserted wait states + s_sendmsg sendmsg(MSG_INTERRUPT) + // Wait for the message to go out. + s_wait_kmcnt 0 + s_mov_b32 m0, ttmp2 + + // Parking the wave requires saving the original pc in the preserved ttmps. 
+ // Register layout before parking the wave: + // + // ttmp10: ?[31:0] + // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] + // + // After parking the wave: + // + // ttmp10: pc_lo[31:0] + // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] + // + // Save the PC + s_mov_b32 ttmp10, ttmp0 + s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK + s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT + s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) + s_or_b32 ttmp11, ttmp11, ttmp1 + + // Park the wave + s_getpc_b64 [ttmp0, ttmp1] + s_add_u32 ttmp0, ttmp0, .parked - . + s_addc_u32 ttmp1, ttmp1, 0x0 + +.halt_wave: + // Halt the wavefront upon restoring STATUS below. + s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT + s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT + + // Initialize TTMP registers + s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT + s_cbranch_scc1 .ttmps_initialized + s_mov_b32 ttmp4, 0 + s_mov_b32 ttmp5, 0 + s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT +.ttmps_initialized: + +.exit_trap: + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12 + s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1) + s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12 + + // Return to original (possibly modified) PC. + s_rfe_b64 [ttmp0, ttmp1] + +.parked: + s_trap 0x2 + s_branch .parked + +// Add s_code_end padding so instruction prefetch always has something to read. +.rept (256 - ((. 
- trap_entry) % 64)) / 4 + s_code_end +.endr diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 43d28bf34..f720b4c4d 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -67,6 +67,7 @@ class Flag { // Lift limit for 2.10 release RCCL workaround. This limit is not used when asynchronous scratch // reclaim is supported const size_t DEFAULT_SCRATCH_SINGLE_LIMIT = 146800640; // small_limit >> 2; + const size_t DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE = 256 * 1024 * 1024; explicit Flag() { Refresh(); } @@ -184,6 +185,9 @@ class Flag { var = os::GetEnvVar("HSA_DISABLE_IMAGE"); disable_image_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_DISABLE_PC_SAMPLING"); + disable_pc_sampling_ = (var == "1") ? true : false; + var = os::GetEnvVar("HSA_LOADER_ENABLE_MMAP_URI"); loader_enable_mmap_uri_ = (var == "1") ? true : false; @@ -228,6 +232,14 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_IPC_MODE_LEGACY"); enable_ipc_mode_legacy_ = (var == "1") ? true : true; // Temporarily always enable + if (os::IsEnvVarSet("HSA_PCS_MAX_DEVICE_BUFFER_SIZE")) { + var = os::GetEnvVar("HSA_PCS_MAX_DEVICE_BUFFER_SIZE"); + char* end; + pc_sampling_max_device_buffer_size_ = strtoul(var.c_str(), &end, 10); + } else { + pc_sampling_max_device_buffer_size_ = DEFAULT_PCS_MAX_DEVICE_BUFFER_SIZE; + } + // Temporary environment variable to disable CPU affinity override // Will either rename to HSA_OVERRIDE_CPU_AFFINITY later or remove completely. 
var = os::GetEnvVar("HSA_OVERRIDE_CPU_AFFINITY_DEBUG"); @@ -297,6 +309,8 @@ class Flag { bool disable_image() const { return disable_image_; } + bool disable_pc_sampling() const { return disable_pc_sampling_; } + bool loader_enable_mmap_uri() const { return loader_enable_mmap_uri_; } size_t force_sdma_size() const { return force_sdma_size_; } @@ -336,6 +350,8 @@ class Flag { bool enable_ipc_mode_legacy() const { return enable_ipc_mode_legacy_; } + size_t pc_sampling_max_device_buffer_size() const { return pc_sampling_max_device_buffer_size_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -353,6 +369,7 @@ class Flag { bool no_scratch_reclaim_; bool no_scratch_thread_limit_; bool disable_image_; + bool disable_pc_sampling_; bool loader_enable_mmap_uri_; bool check_sramecc_validity_; bool debug_; @@ -390,6 +407,8 @@ class Flag { SRAMECC_ENABLE sramecc_enable_; + size_t pc_sampling_max_device_buffer_size_; + // Map GPU index post RVD to its default cu mask. std::map> cu_mask_; diff --git a/src/core/util/lazy_ptr.h b/src/core/util/lazy_ptr.h index e2a847b5c..2aef6a3bf 100644 --- a/src/core/util/lazy_ptr.h +++ b/src/core/util/lazy_ptr.h @@ -59,7 +59,7 @@ template class lazy_ptr { public: lazy_ptr() {} - explicit lazy_ptr(std::function Constructor) { Init(Constructor); } + explicit lazy_ptr(std::function Constructor) { reset(Constructor); } lazy_ptr(lazy_ptr&& rhs) { obj = std::move(rhs.obj); diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp index b13c907b8..d36dc0d5d 100644 --- a/src/core/util/lnx/os_linux.cpp +++ b/src/core/util/lnx/os_linux.cpp @@ -108,13 +108,20 @@ class os_thread { err = pthread_attr_setstacksize(&attrib, stackSize); if (err != 0) { fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); - return; + err = pthread_attr_destroy(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_destroy failed: %s\n", strerror(err)); + return; + } } } + int cores = 0; + cpu_set_t* cpuset = 
nullptr; + if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) { - int cores = get_nprocs_conf(); - cpu_set_t* cpuset = CPU_ALLOC(cores); + cores = get_nprocs_conf(); + cpuset = CPU_ALLOC(cores); if (cpuset == nullptr) { fprintf(stderr, "CPU_ALLOC failed: %s\n", strerror(errno)); return; @@ -126,7 +133,7 @@ class os_thread { err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); CPU_FREE(cpuset); if (err != 0) { - fprintf(stderr, "pthread_attr_setaffinity_np failed: %s\n", strerror(err)); + fprintf(stderr, "pthread_setaffinity_np failed: %s\n", strerror(err)); return; } } @@ -642,11 +649,20 @@ SharedMutex CreateSharedMutex() { fprintf(stderr, "rw lock attribute init failed: %s\n", strerror(err)); return nullptr; } + +#ifdef __GLIBC__ err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); if (err != 0) { fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); return nullptr; } +#else + err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); + if (err != 0) { + fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } +#endif pthread_rwlock_t* lock = new pthread_rwlock_t; err = pthread_rwlock_init(lock, &attrib); diff --git a/src/core/util/utils.h b/src/core/util/utils.h index ab536ba79..1a454d7bd 100644 --- a/src/core/util/utils.h +++ b/src/core/util/utils.h @@ -74,8 +74,7 @@ static __forceinline void* _aligned_malloc(size_t size, size_t alignment) { return aligned_alloc(alignment, size); #else void *mem = NULL; - if (NULL != posix_memalign(&mem, alignment, size)) - return NULL; + if (0 != posix_memalign(&mem, alignment, size)) return NULL; return mem; #endif } diff --git a/src/hsacore.so.def b/src/hsacore.so.def index 4d9c92186..dd9b554a1 100644 --- a/src/hsacore.so.def +++ b/src/hsacore.so.def @@ -234,6 +234,7 @@ global: hsa_amd_portable_export_dmabuf; hsa_amd_portable_close_dmabuf; 
hsa_amd_vmem_address_reserve; + hsa_amd_vmem_address_reserve_align; hsa_amd_vmem_address_free; hsa_amd_vmem_handle_create; hsa_amd_vmem_handle_release; @@ -252,7 +253,14 @@ global: hsa_tools_scratch_event_free_end; hsa_tools_scratch_event_async_reclaim_start; hsa_tools_scratch_event_async_reclaim_end; - + hsa_ven_amd_pcs_iterate_configuration; + hsa_ven_amd_pcs_create; + hsa_ven_amd_pcs_create_from_id; + hsa_ven_amd_pcs_destroy; + hsa_ven_amd_pcs_start; + hsa_ven_amd_pcs_stop; + hsa_ven_amd_pcs_flush; + hsa_amd_queue_get_info; local: *; }; diff --git a/src/image/addrlib/inc/addrinterface.h b/src/image/addrlib/inc/addrinterface.h index 5260426b6..39dd282c7 100644 --- a/src/image/addrlib/inc/addrinterface.h +++ b/src/image/addrlib/inc/addrinterface.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,9 +19,8 @@ #include "addrtypes.h" namespace rocr { - -#define ADDRLIB_VERSION_MAJOR 6 -#define ADDRLIB_VERSION_MINOR 2 +#define ADDRLIB_VERSION_MAJOR 8 +#define ADDRLIB_VERSION_MINOR 10 #define ADDRLIB_VERSION ((ADDRLIB_VERSION_MAJOR << 16) | ADDRLIB_VERSION_MINOR) /// Virtually all interface functions need ADDR_HANDLE as first parameter @@ -47,6 +29,13 @@ typedef VOID* ADDR_HANDLE; /// Client handle used in callbacks typedef VOID* ADDR_CLIENT_HANDLE; +typedef struct _ADDR_EXTENT3D +{ + UINT_32 width; + UINT_32 height; + UINT_32 depth; // also slices for 2D images +} ADDR_EXTENT3D; + /** * ///////////////////////////////////////////////////////////////////////////////////////////////// * // Callback functions @@ -124,7 +113,7 @@ typedef union _ADDR_CHANNEL_SETTING struct { UINT_8 valid : 1; ///< Indicate whehter this channel setting is valid - UINT_8 channel : 2; ///< 0 for x channel, 1 for y channel, 2 for z channel + UINT_8 channel : 2; ///< 0 for x channel, 1 for y channel, 2 for z channel, 3 for MSAA sample index UINT_8 index : 5; ///< Channel index }; UINT_8 value; ///< Value @@ -158,18 +147,29 @@ typedef union _ADDR_EQUATION_KEY * @brief address equation structure **************************************************************************************************** */ -#define ADDR_MAX_EQUATION_BIT 20u +#define ADDR_MAX_LEGACY_EQUATION_COMP 3u +#define ADDR_MAX_EQUATION_COMP 5u +#define ADDR_MAX_EQUATION_BIT 20u // Invalid equation index #define ADDR_INVALID_EQUATION_INDEX 0xFFFFFFFF typedef struct _ADDR_EQUATION { - 
ADDR_CHANNEL_SETTING addr[ADDR_MAX_EQUATION_BIT]; ///< addr setting - ///< each bit is result of addr ^ xor ^ xor2 - ADDR_CHANNEL_SETTING xor1[ADDR_MAX_EQUATION_BIT]; ///< xor setting - ADDR_CHANNEL_SETTING xor2[ADDR_MAX_EQUATION_BIT]; ///< xor2 setting + union + { + struct { + ADDR_CHANNEL_SETTING addr[ADDR_MAX_EQUATION_BIT]; ///< addr setting + ADDR_CHANNEL_SETTING xor1[ADDR_MAX_EQUATION_BIT]; ///< xor setting + ADDR_CHANNEL_SETTING xor2[ADDR_MAX_EQUATION_BIT]; ///< xor2 setting + ADDR_CHANNEL_SETTING xor3[ADDR_MAX_EQUATION_BIT]; ///< xor3 setting + ADDR_CHANNEL_SETTING xor4[ADDR_MAX_EQUATION_BIT]; ///< xor4 setting + }; + ///< Components showing the sources of each bit; each bit is result of addr ^ xor ^ xor2... + ADDR_CHANNEL_SETTING comps[ADDR_MAX_EQUATION_COMP][ADDR_MAX_EQUATION_BIT]; + }; UINT_32 numBits; ///< The number of bits in equation + UINT_32 numBitComponents; ///< The max number of channels contributing to a bit BOOL_32 stackedDepthSlices; ///< TRUE if depth slices are treated as being ///< stacked vertically prior to swizzling } ADDR_EQUATION; @@ -1723,6 +1723,30 @@ typedef enum _AddrSwizzleGenOption ADDR_SWIZZLE_GEN_LINEAR = 1, ///< Using a linear increment of swizzle } AddrSwizzleGenOption; +/** +**************************************************************************************************** +* AddrBlockType +* +* @brief +* Macro define resource block type +**************************************************************************************************** +*/ +typedef enum +{ + AddrBlockLinear = 0, // Resource uses linear swizzle mode + AddrBlockMicro = 1, // Resource uses 256B block + AddrBlockThin4KB = 2, // Resource uses thin 4KB block + AddrBlockThick4KB = 3, // Resource uses thick 4KB block + AddrBlockThin64KB = 4, // Resource uses thin 64KB block + AddrBlockThick64KB = 5, // Resource uses thick 64KB block + AddrBlockThinVar = 6, // Resource uses thin var block + AddrBlockThickVar = 7, // Resource uses thick var block + 
AddrBlockMaxTiledType, + + AddrBlockThin256KB = AddrBlockThinVar, + AddrBlockThick256KB = AddrBlockThickVar, +} AddrBlockType; + /** **************************************************************************************************** * AddrSwizzleOption @@ -2408,7 +2432,8 @@ typedef union _ADDR2_SURFACE_FLAGS UINT_32 metaRbUnaligned : 1; ///< This resource has rb unaligned metadata UINT_32 metaPipeUnaligned : 1; ///< This resource has pipe unaligned metadata UINT_32 view3dAs2dArray : 1; ///< This resource is a 3D resource viewed as 2D array - UINT_32 reserved : 13; ///< Reserved bits + UINT_32 allowExtEquation : 1; ///< If unset, only legacy DX eqs are allowed (2 XORs) + UINT_32 reserved : 12; ///< Reserved bits }; UINT_32 value; @@ -2585,7 +2610,7 @@ typedef struct _ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT { UINT_32 size; ///< Size of this structure in bytes - UINT_64 addr; ///< Byte address + UINT_64 addr; ///< Byte offset from the image starting address UINT_32 bitPosition; ///< Bit position within surfaceAddr, 0-7. /// For surface bpp < 8, e.g. FMT_1. UINT_32 prtBlockIndex; ///< Index of a PRT tile (64K block) @@ -3691,7 +3716,7 @@ typedef struct _ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT AddrResourceType resourceType; ///< Surface type AddrFormat format; ///< Surface format UINT_32 width; ///< Width of mip0 in texels (not in compressed block) - UINT_32 height; ///< Height of mip0 in texels (not in compressed block) + UINT_32 height; ///< Height of mip0 in texels (not in compressed block) UINT_32 numSlices; ///< Number surface slice/depth of mip0 UINT_32 numMipLevels; ///< Total mipmap levels. 
UINT_32 pipeBankXor; ///< Combined swizzle used to do bank/pipe rotation @@ -3924,6 +3949,20 @@ ADDR_E_RETURNCODE ADDR_API Addr2GetPreferredSurfaceSetting( const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut); +/** +**************************************************************************************************** +* Addr2GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the +* client to choose from +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetPossibleSwizzleModes( + ADDR_HANDLE hLib, + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut); + /** **************************************************************************************************** * Addr2IsValidDisplaySwizzleMode @@ -3938,6 +3977,520 @@ ADDR_E_RETURNCODE ADDR_API Addr2IsValidDisplaySwizzleMode( UINT_32 bpp, BOOL_32 *pResult); -} // rocr +/** +**************************************************************************************************** +* Addr2GetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedBlockSet( + ADDR_HANDLE hLib, + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet); + +/** +**************************************************************************************************** +* Addr2GetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedSwSet( + ADDR_HANDLE hLib, + 
ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet); + +/** +**************************************************************************************************** +* Addr2IsBlockTypeAvailable +* +* @brief +* Determine whether a block type is allowed in a given blockSet +**************************************************************************************************** +*/ +BOOL_32 Addr2IsBlockTypeAvailable(ADDR2_BLOCK_SET blockSet, AddrBlockType blockType); + +/** +**************************************************************************************************** +* Addr2BlockTypeWithinMemoryBudget +* +* @brief +* Determine whether a new block type is acceptable based on memory waste ratio. Will favor +* larger block types. +**************************************************************************************************** +*/ +BOOL_32 Addr2BlockTypeWithinMemoryBudget( + UINT_64 minSize, + UINT_64 newBlockTypeSize, + UINT_32 ratioLow, + UINT_32 ratioHi, +#if defined(__cplusplus) + DOUBLE memoryBudget = 0.0f, + BOOL_32 newBlockTypeBigger = TRUE); +#else + DOUBLE memoryBudget, + BOOL_32 newBlockTypeBigger); +#endif + +/** +**************************************************************************************************** +* ADDR3_SURFACE_FLAGS +* +* @brief +* Surface flags +**************************************************************************************************** +*/ +typedef union _ADDR3_SURFACE_FLAGS +{ + struct + { + UINT_32 color : 1; ///< This resource is a color buffer, can be used with RTV + UINT_32 depth : 1; ///< This resource is a depth buffer, can be used with DSV + UINT_32 stencil : 1; ///< This resource is a stencil buffer, can be used with DSV + UINT_32 texture : 1; ///< This resource can be used with SRV + UINT_32 unordered : 1; ///< This resource can be used with UAV + UINT_32 hiZHiS : 1; + UINT_32 blockCompressed : 1; + UINT_32 nv12 : 1; + UINT_32 p010 : 1; + UINT_32 view3dAs2dArray : 1; + UINT_32 isVrsImage : 1; ///< 
This resource is a VRS source image + UINT_32 reserved : 21; ///< Reserved bits + }; + + UINT_32 value; +} ADDR3_SURFACE_FLAGS; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_INFO_INPUT +* +* @brief +* Input structure for Addr3ComputeSurfaceInfo +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_INFO_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + Addr3SwizzleMode swizzleMode; ///< Swizzle Mode for Gfx12 + AddrResourceType resourceType; ///< Surface type + AddrFormat format; ///< Surface format + UINT_32 bpp; ///< bits per pixel + UINT_32 width; ///< Width (of mip0), in pixels + UINT_32 height; ///< Height (of mip0), in pixels + UINT_32 numSlices; ///< Number surface slice/depth (of mip0), + UINT_32 numMipLevels; ///< Total mipmap levels. + UINT_32 numSamples; ///< Number of samples + UINT_32 pitchInElement; ///< Pitch in elements (blocks for compressed formats) + UINT_32 sliceAlign; ///< Required slice size in bytes +} ADDR3_COMPUTE_SURFACE_INFO_INPUT; + +/** +**************************************************************************************************** +* ADDR3_MIP_INFO +* +* @brief +* Structure that contains information for mip level +* +**************************************************************************************************** +*/ +typedef struct _ADDR3_MIP_INFO +{ + UINT_32 pitch; ///< Pitch in elements + UINT_32 height; ///< Padded height in elements + UINT_32 depth; ///< Padded depth + UINT_32 pixelPitch; ///< Pitch in pixels + UINT_32 pixelHeight; ///< Padded height in pixels + UINT_32 equationIndex; ///< Equation index in the equation table + UINT_64 offset; ///< Offset in bytes from mip base, should only be used + ///< to setup vam surface descriptor, can't be used + ///< to setup swizzle pattern + 
UINT_64 macroBlockOffset; ///< macro block offset in bytes from mip base + UINT_32 mipTailOffset; ///< mip tail offset in bytes + UINT_32 mipTailCoordX; ///< mip tail coord x + UINT_32 mipTailCoordY; ///< mip tail coord y + UINT_32 mipTailCoordZ; ///< mip tail coord z +} ADDR3_MIP_INFO; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_INFO_OUTPUT +* +* @brief +* Output structure for Addr3ComputeSurfaceInfo +* @note + Element: AddrLib unit for computing. e.g. BCn: 4x4 blocks; R32B32B32: 32bit with 3x pitch + Pixel: Original pixel +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_INFO_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 pitch; ///< Pitch in elements (blocks for compressed formats) + UINT_32 pixelPitch; ///< Pitch in original pixels + UINT_32 pixelHeight; ///< Height in original pixels + UINT_32 pixelBits; ///< Original bits per pixel, passed from input + UINT_32 bpp; ///< Bits per elements + /// (e.g. 
blocks for BCn, 1/3 for 96bit) + UINT_32 numSlices; ///< Padded depth for 3d resource + /// or padded number of slices for 2d array resource + UINT_32 height; ///< Padded height (of mip0) in elements + UINT_64 sliceSize; ///< Slice (total mip chain) size in bytes + UINT_64 surfSize; ///< Surface (total mip chain) size in bytes + UINT_32 baseAlign; ///< Base address alignment + ADDR_EXTENT3D blockExtent; ///< Dimensions in element inside one block + UINT_32 pixelMipChainPitch; ///< Mip chain pitch in original pixels + UINT_32 pixelMipChainHeight; ///< Mip chain height in original pixels + ADDR3_MIP_INFO* pMipInfo; ///< Info regarding the start, sizes of the mip levels + BOOL_32 mipChainInTail; ///< If whole mipchain falls into mip tail block + UINT_32 firstMipIdInTail; ///< The id of first mip in tail, if there is no mip + /// in tail, it will be set to number of mip levels +} ADDR3_COMPUTE_SURFACE_INFO_OUTPUT; + +/** +**************************************************************************************************** +* ADDR3_SWMODE_SET +* +* @brief +* Bit field that defines swizzle type +**************************************************************************************************** +*/ +// The bit order MUST be the same as Addr3SwizzleMode enumerations, otherwise using bitset to enable +// or disable swizzle modes will be problematic. 
+typedef union _ADDR3_SWMODE_SET +{ + struct + { + UINT_32 swLinear : 1; + UINT_32 sw2d256B : 1; + UINT_32 sw2d4kB : 1; + UINT_32 sw2d64kB : 1; + UINT_32 sw2d256kB : 1; + UINT_32 sw3d4kB : 1; + UINT_32 sw3d64kB : 1; + UINT_32 sw3d256kB : 1; + UINT_32 reserved : 24; + }; + + UINT_32 value; +} ADDR3_SWMODE_SET; + +/** +**************************************************************************************************** +* ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT +* +* @brief +* Input structure of Addr3GetPossibleSwizzleModes +**************************************************************************************************** +*/ +typedef struct _ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + AddrResourceType resourceType; ///< Surface type + UINT_32 bpp; ///< bits per pixel + UINT_32 width; ///< Width (of mip0), in pixels + UINT_32 height; ///< Height (of mip0), in pixels + UINT_32 numSlices; ///< Number surface slice/depth (of mip0), + UINT_32 numMipLevels; ///< Total mipmap levels. + UINT_32 numSamples; ///< Number of samples + UINT_32 maxAlign; ///< maximum base/size alignment requested by client +} ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT; + +/** +**************************************************************************************************** +* ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT +* +* @brief +* Output structure of Addr3GetPossibleSwizzleModes +**************************************************************************************************** +*/ +typedef struct _ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + ADDR3_SWMODE_SET validModes; ///< List of valid swizzle modes for this function. 
+} ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSurfaceInfo +* +* @brief +* Compute surface width/height/slices/alignments and suitable tiling mode +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceInfo( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut); + +/** +**************************************************************************************************** +* Addr3GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the +* client to choose from +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3GetPossibleSwizzleModes( + ADDR_HANDLE hLib, + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT +* +* @brief +* Input structure for Addr3ComputeSurfaceAddrFromCoord +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + UINT_32 x; ///< X coordinate + UINT_32 y; ///< Y coordinate + UINT_32 slice; ///< Slice index + UINT_32 sample; ///< Sample index, use fragment index for EQAA + UINT_32 mipId; ///< the mip ID in mip chain + + Addr3SwizzleMode swizzleMode; ///< Swizzle mode for Gfx12 + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + AddrResourceType resourceType; ///< Surface type + UINT_32 bpp; ///< Bits per pixel + ADDR_EXTENT3D unAlignedDims; ///< Surface original 
dimensions (of mip0) + UINT_32 numMipLevels; ///< Total mipmap levels + UINT_32 numSamples; ///< Number of samples + UINT_32 pitchInElement; ///< Pitch in elements (blocks for compressed formats) +} ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT +* +* @brief +* Output structure for Addr3ComputeSurfaceAddrFromCoord +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + + UINT_64 addr; ///< Byte offset from the image starting address + UINT_32 bitPosition; ///< Bit position within surfaceAddr, 0-7. + /// For surface bpp < 8, e.g. FMT_1. + UINT_32 prtBlockIndex; ///< Index of a PRT tile (64K block) +} ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSurfaceAddrFromCoord +* +* @brief +* Compute surface address from a given coordinate. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceAddrFromCoord( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_PIPEBANKXOR_INPUT +* +* @brief +* Input structure of Addr3ComputePipebankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_PIPEBANKXOR_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 surfIndex; ///< Input surface index + Addr3SwizzleMode swizzleMode; ///< Surface swizzle mode +} ADDR3_COMPUTE_PIPEBANKXOR_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT +* +* @brief +* Output structure of Addr3ComputePipebankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 pipeBankXor; ///< Pipe bank xor +} ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputePipeBankXor +* +* @brief +* Calculate a valid bank pipe xor value for client to use. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputePipeBankXor( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT +* +* @brief +* Input structure of Addr3ComputeNonBlockCompressedView +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + ADDR3_SURFACE_FLAGS flags; ///< Surface flags + Addr3SwizzleMode swizzleMode; ///< Swizzle Mode for Gfx12 + AddrResourceType resourceType; ///< Surface type + AddrFormat format; ///< Surface format + ADDR_EXTENT3D unAlignedDims; ///< Surface original dimensions (of mip0) + UINT_32 numMipLevels; ///< Total mipmap levels. 
+ UINT_32 pipeBankXor; ///< Combined swizzle used to do bank/pipe rotation + UINT_32 slice; ///< Index of slice to view + UINT_32 mipId; ///< Id of mip to view +} ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT +* +* @brief +* Output structure of Addr3ComputeNonBlockCompressedView +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_64 offset; ///< Offset from resource base for the view + UINT_32 pipeBankXor; ///< Pipe bank xor for the view + ADDR_EXTENT3D unAlignedDims; ///< Mip0 dimens (in element) for the view + UINT_32 numMipLevels; ///< Total mipmap levels for the view + UINT_32 mipId; ///< Mip ID for the view +} ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeNonBlockCompressedView +* +* @brief +* Compute non-block-compressed view for a given mipmap level/slice +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeNonBlockCompressedView( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT +* +* @brief +* Input structure of Addr3ComputeSubResourceOffsetForSwizzlePattern +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT +{ + UINT_32 size; ///< Size of this 
structure in bytes + Addr3SwizzleMode swizzleMode; ///< Surface swizzle mode + AddrResourceType resourceType; ///< Surface resource type + UINT_32 pipeBankXor; ///< Per resource xor + UINT_32 slice; ///< Slice id + UINT_64 sliceSize; ///< Slice size of a mip chain + UINT_64 macroBlockOffset; ///< Macro block offset, returned in ADDR3_MIP_INFO + UINT_32 mipTailOffset; ///< Mip tail offset, returned in ADDR3_MIP_INFO +} ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT +* +* @brief +* Output structure of Addr3ComputeSubResourceOffsetForSwizzlePattern +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_64 offset; ///< offset +} ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Calculate sub resource offset to support swizzle pattern. 
+**************************************************************************************************** +*/ +VOID ADDR_API Addr3ComputeSubResourceOffsetForSwizzlePattern( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut); + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT +* +* @brief +* Input structure of Addr2ComputeSlicePipeBankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT +{ + UINT_32 size; ///< Size of this structure in bytes + Addr3SwizzleMode swizzleMode; ///< Surface swizzle mode + AddrResourceType resourceType; ///< Surface resource type + UINT_32 bpe; ///< bits per element (e.g. block size for BCn format) + UINT_32 basePipeBankXor; ///< Base pipe bank xor + UINT_32 slice; ///< Slice id + UINT_32 numSamples; ///< Number of samples +} ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT; + +/** +**************************************************************************************************** +* ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT +* +* @brief +* Output structure of Addr3ComputeSlicePipeBankXor +**************************************************************************************************** +*/ +typedef struct _ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT +{ + UINT_32 size; ///< Size of this structure in bytes + UINT_32 pipeBankXor; ///< Pipe bank xor +} ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT; + +/** +**************************************************************************************************** +* Addr3ComputeSlicePipeBankXor +* +* @brief +* Calculate slice pipe bank xor value based on base pipe bank xor and slice id. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSlicePipeBankXor( + ADDR_HANDLE hLib, + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut); +} // namespace rocr #endif // __ADDR_INTERFACE_H__ diff --git a/src/image/addrlib/inc/addrtypes.h b/src/image/addrlib/inc/addrtypes.h index ccecc2473..aa1b48873 100644 --- a/src/image/addrlib/inc/addrtypes.h +++ b/src/image/addrlib/inc/addrtypes.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -292,6 +275,26 @@ typedef enum _AddrSwizzleMode ADDR_SW_256KB_R_X = ADDR_SW_MISCDEF31, } AddrSwizzleMode; +/** +**************************************************************************************************** +* @brief +* Neutral enums that define swizzle modes for Gfx12+ ASIC +* +**************************************************************************************************** +*/ +typedef enum _Addr3SwizzleMode +{ + ADDR3_LINEAR = 0, + ADDR3_256B_2D = 1, + ADDR3_4KB_2D = 2, + ADDR3_64KB_2D = 3, + ADDR3_256KB_2D = 4, + ADDR3_4KB_3D = 5, + ADDR3_64KB_3D = 6, + ADDR3_256KB_3D = 7, + ADDR3_MAX_TYPE = 8, +} Addr3SwizzleMode; + /** **************************************************************************************************** * @brief @@ -454,6 +457,7 @@ typedef enum _AddrFormat { ADDR_FMT_ASTC_12x12 = 0x0000004d, ADDR_FMT_ETC2_64BPP = 0x0000004e, ADDR_FMT_ETC2_128BPP = 0x0000004f, + ADDR_FMT_BG_RG_16_16_16_16 = 0x00000050, } AddrFormat; /** diff --git a/src/image/addrlib/src/addrinterface.cpp b/src/image/addrlib/src/addrinterface.cpp index d1ebf2680..0bc83678d 100644 --- a/src/image/addrlib/src/addrinterface.cpp +++ b/src/image/addrlib/src/addrinterface.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -32,6 +15,7 @@ #include "addrinterface.h" #include "addrlib1.h" #include "addrlib2.h" +#include "addrlib3.h" #include "addrcommon.h" @@ -1796,7 +1780,377 @@ ADDR_E_RETURNCODE ADDR_API Addr2IsValidDisplaySwizzleMode( returnCode = ADDR_ERROR; } - return returnCode; + return returnCode; +} + +/** +**************************************************************************************************** +* Addr2GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the +* client to choose from +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetPossibleSwizzleModes( + ADDR_HANDLE hLib, ///< handle of addrlib 
+ const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ///< [in] input + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V2::Lib* pLib = V2::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->GetPossibleSwizzleModes(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} +/** +**************************************************************************************************** +* Addr2GetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedBlockSet( + ADDR_HANDLE hLib, ///< handle of addrlib + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + AddrResourceType rsrcType, ///< [in] resource type + ADDR2_BLOCK_SET* pAllowedBlockSet) ///< [out] allowed block sizes +{ + ADDR_E_RETURNCODE returnCode; + + V2::Lib* pLib = V2::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->GetAllowedBlockSet(allowedSwModeSet, rsrcType, pAllowedBlockSet); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr2GetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr2GetAllowedSwSet( + ADDR_HANDLE hLib, ///< handle of addrlib + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + ADDR2_SWTYPE_SET* pAllowedSwSet) ///< [out] allowed swizzle types +{ + ADDR_E_RETURNCODE returnCode; + + V2::Lib* pLib = V2::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->GetAllowedSwSet(allowedSwModeSet, pAllowedSwSet); + } 
+ else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr2IsBlockTypeAvailable +* +* @brief +* Determine whether a block type is allowed in a given blockSet +**************************************************************************************************** +*/ +BOOL_32 Addr2IsBlockTypeAvailable( + ADDR2_BLOCK_SET blockSet, + AddrBlockType blockType) +{ + BOOL_32 avail; + + if (blockType == AddrBlockLinear) + { + avail = blockSet.linear ? TRUE : FALSE; + } + else + { + avail = blockSet.value & (1 << (static_cast(blockType) - 1)) ? TRUE : FALSE; + } + + return avail; +} + +/** +**************************************************************************************************** +* Addr2BlockTypeWithinMemoryBudget +* +* @brief +* Determine whether a new block type is acceptable based on memory waste ratio. Will favor +* larger block types. +**************************************************************************************************** +*/ +BOOL_32 Addr2BlockTypeWithinMemoryBudget( + UINT_64 minSize, + UINT_64 newBlockTypeSize, + UINT_32 ratioLow, + UINT_32 ratioHi, + DOUBLE memoryBudget, + BOOL_32 newBlockTypeBigger) +{ + BOOL_32 accept = FALSE; + + if (memoryBudget >= 1.0) + { + if (newBlockTypeBigger) + { + if ((static_cast(newBlockTypeSize) / minSize) <= memoryBudget) + { + accept = TRUE; + } + } + else + { + if ((static_cast(minSize) / newBlockTypeSize) > memoryBudget) + { + accept = TRUE; + } + } + } + else + { + if (newBlockTypeBigger) + { + if ((newBlockTypeSize * ratioHi) <= (minSize * ratioLow)) + { + accept = TRUE; + } + } + else + { + if ((newBlockTypeSize * ratioLow) < (minSize * ratioHi)) + { + accept = TRUE; + } + } + } + + return accept; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Surface functions for Addr3 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +/** +**************************************************************************************************** +* Addr3ComputeSurfaceInfo +* +* @brief +* Calculate surface width/height/depth/alignments and suitable tiling mode +* +* @return +* ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceInfo( + ADDR_HANDLE hLib, ///< address lib handle + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] surface information + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) ///< [out] surface parameters and alignments +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pLib != NULL) + { + returnCode = pLib->ComputeSurfaceInfo(pIn, pOut); + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr3GetPossibleSwizzleModes +* +* @brief +* Get valid swizzle mode options given image input for further optimal selection +* +* @return +* ADDR_OK if successful, otherwise an error code of ADDR_PARAMSIZEMISMATCH +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3GetPossibleSwizzleModes( + ADDR_HANDLE hLib, ///< address lib handle + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, ///< [in] surface information + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut) ///< [out] allowable swizzle mdoes +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pLib != NULL) + { + returnCode = pLib->GetPossibleSwizzleModes(pIn, pOut); + } + + return returnCode; +} + +/** +**************************************************************************************************** +* 
Addr3ComputeSurfaceAddrFromCoord +* +* @brief +* Compute surface address according to coordinates +* +* @return +* ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSurfaceAddrFromCoord( + ADDR_HANDLE hLib, ///< address lib handle + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] surface info and coordinates + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) ///< [out] surface address +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pLib != NULL) + { + returnCode = pLib->ComputeSurfaceAddrFromCoord(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; } -} // namespace rocr \ No newline at end of file +/** +**************************************************************************************************** +* Addr3ComputePipeBankXor +* +* @brief +* Calculate a valid bank pipe xor value for client to use. +**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputePipeBankXor( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->ComputePipeBankXor(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr3ComputeNonBlockCompressedView +* +* @brief +* Compute non-block-compressed view for a given mipmap level/slice. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeNonBlockCompressedView( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->ComputeNonBlockCompressedView(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} + +/** +**************************************************************************************************** +* Addr3ComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Calculate sub resource offset for swizzle pattern. +**************************************************************************************************** +*/ +VOID ADDR_API Addr3ComputeSubResourceOffsetForSwizzlePattern( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) ///< [out] output +{ + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + pLib->ComputeSubResourceOffsetForSwizzlePattern(pIn, pOut); + } +} + +/** +**************************************************************************************************** +* Addr3ComputeSlicePipeBankXor +* +* @brief +* Calculate slice pipe bank xor value based on base pipe bank xor and slice id. 
+**************************************************************************************************** +*/ +ADDR_E_RETURNCODE ADDR_API Addr3ComputeSlicePipeBankXor( + ADDR_HANDLE hLib, ///< handle of addrlib + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, ///< [in] input + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) ///< [out] output +{ + ADDR_E_RETURNCODE returnCode; + + V3::Lib* pLib = V3::Lib::GetLib(hLib); + + if (pLib != NULL) + { + returnCode = pLib->ComputeSlicePipeBankXor(pIn, pOut); + } + else + { + returnCode = ADDR_ERROR; + } + + return returnCode; +} +} //namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/amdgpu_asic_addr.h b/src/image/addrlib/src/amdgpu_asic_addr.h index c384c138c..1909e56cb 100644 --- a/src/image/addrlib/src/amdgpu_asic_addr.h +++ b/src/image/addrlib/src/amdgpu_asic_addr.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -33,24 +16,23 @@ #define AMDGPU_VENDOR_IS_AMD(v) ((v == ATI_VENDOR_ID) || (v == AMD_VENDOR_ID)) #define FAMILY_UNKNOWN 0x00 -#define FAMILY_TN 0x69 -#define FAMILY_SI 0x6E -#define FAMILY_CI 0x78 -#define FAMILY_KV 0x7D -#define FAMILY_VI 0x82 -#define FAMILY_POLARIS 0x82 -#define FAMILY_CZ 0x87 -#define FAMILY_AI 0x8D -#define FAMILY_RV 0x8E -#define FAMILY_NV 0x8F -#define FAMILY_VGH 0x90 -#define FAMILY_GFX1100 0x91 -#define FAMILY_GFX1103 0x94 +#define FAMILY_TN 0x69 //# 105 / Trinity APUs +#define FAMILY_SI 0x6E //# 110 / Southern Islands: Tahiti, Pitcairn, CapeVerde, Oland, Hainan +#define FAMILY_CI 0x78 //# 120 / Sea Islands: Bonaire, Hawaii +#define FAMILY_KV 0x7D //# 125 / Kaveri APUs: Spectre, Spooky, Kalindi, Godavari +#define FAMILY_VI 0x82 //# 130 / Volcanic Islands: Iceland, Tonga, Fiji +#define FAMILY_CZ 0x87 //# 135 / Carrizo APUs: Carrizo, Stoney +#define FAMILY_AI 0x8D //# 141 / Vega: 10, 20 +#define FAMILY_RV 0x8E //# 142 / Raven +#define FAMILY_NV 0x8F //# 143 / Navi: 10 +#define FAMILY_VGH 0x90 //# 144 / Van Gogh +#define FAMILY_NV3 0x91 //# 145 / Navi: 3x #define FAMILY_GFX1150 0x96 -#define FAMILY_RMB 0x92 -#define FAMILY_GC_10_3_6 0x95 -#define FAMILY_GC_10_3_7 0x97 - +#define FAMILY_GFX1103 0x94 +#define FAMILY_RMB 0x92 //# 146 / Rembrandt +#define FAMILY_RPL 0x95 //# 149 / Raphael +#define FAMILY_MDN 0x97 //# 151 / Mendocino +#define FAMILY_GFX12 0x98 // AMDGPU_FAMILY_IS(familyId, familyName) #define FAMILY_IS(f, fn) (f == FAMILY_##fn) @@ -64,70 +46,72 @@ #define FAMILY_IS_AI(f) FAMILY_IS(f, AI) #define 
FAMILY_IS_RV(f) FAMILY_IS(f, RV) #define FAMILY_IS_NV(f) FAMILY_IS(f, NV) +#define FAMILY_IS_NV3(f) FAMILY_IS(f, NV3) #define FAMILY_IS_RMB(f) FAMILY_IS(f, RMB) -#define FAMILY_IS_GFX1100(f) FAMILY_IS(f, GFX1100) -#define FAMILY_IS_GFX1103(f) FAMILY_IS(f, GFX1103) -#define FAMILY_IS_GFX1150(f) FAMILY_IS(f, GFX1150) +#define FAMILY_IS_GFX12(f) FAMILY_IS(f, GFX12) #define AMDGPU_UNKNOWN 0xFF -#define AMDGPU_TAHITI_RANGE 0x05, 0x14 -#define AMDGPU_PITCAIRN_RANGE 0x15, 0x28 -#define AMDGPU_CAPEVERDE_RANGE 0x29, 0x3C -#define AMDGPU_OLAND_RANGE 0x3C, 0x46 -#define AMDGPU_HAINAN_RANGE 0x46, 0xFF +#define AMDGPU_TAHITI_RANGE 0x05, 0x14 //# 5 <= x < 20 +#define AMDGPU_PITCAIRN_RANGE 0x15, 0x28 //# 21 <= x < 40 +#define AMDGPU_CAPEVERDE_RANGE 0x29, 0x3C //# 41 <= x < 60 +#define AMDGPU_OLAND_RANGE 0x3C, 0x46 //# 60 <= x < 70 +#define AMDGPU_HAINAN_RANGE 0x46, 0xFF //# 70 <= x < max + +#define AMDGPU_BONAIRE_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_HAWAII_RANGE 0x28, 0x3C //# 40 <= x < 60 + +#define AMDGPU_SPECTRE_RANGE 0x01, 0x41 //# 1 <= x < 65 +#define AMDGPU_SPOOKY_RANGE 0x41, 0x81 //# 65 <= x < 129 +#define AMDGPU_KALINDI_RANGE 0x81, 0xA1 //# 129 <= x < 161 +#define AMDGPU_GODAVARI_RANGE 0xA1, 0xFF //# 161 <= x < max -#define AMDGPU_BONAIRE_RANGE 0x14, 0x28 -#define AMDGPU_HAWAII_RANGE 0x28, 0x3C +#define AMDGPU_ICELAND_RANGE 0x01, 0x14 //# 1 <= x < 20 +#define AMDGPU_TONGA_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_FIJI_RANGE 0x3C, 0x50 //# 60 <= x < 80 -#define AMDGPU_SPECTRE_RANGE 0x01, 0x41 -#define AMDGPU_SPOOKY_RANGE 0x41, 0x81 -#define AMDGPU_KALINDI_RANGE 0x81, 0xA1 -#define AMDGPU_GODAVARI_RANGE 0xA1, 0xFF +#define AMDGPU_POLARIS10_RANGE 0x50, 0x5A //# 80 <= x < 90 +#define AMDGPU_POLARIS11_RANGE 0x5A, 0x64 //# 90 <= x < 100 +#define AMDGPU_POLARIS12_RANGE 0x64, 0x6E //# 100 <= x < 110 +#define AMDGPU_VEGAM_RANGE 0x6E, 0xFF //# 110 <= x < max -#define AMDGPU_ICELAND_RANGE 0x01, 0x14 -#define AMDGPU_TONGA_RANGE 0x14, 0x28 -#define 
AMDGPU_FIJI_RANGE 0x3C, 0x50 +#define AMDGPU_CARRIZO_RANGE 0x01, 0x21 //# 1 <= x < 33 +#define AMDGPU_BRISTOL_RANGE 0x10, 0x21 //# 16 <= x < 33 +#define AMDGPU_STONEY_RANGE 0x61, 0xFF //# 97 <= x < max -#define AMDGPU_POLARIS10_RANGE 0x50, 0x5A -#define AMDGPU_POLARIS11_RANGE 0x5A, 0x64 -#define AMDGPU_POLARIS12_RANGE 0x64, 0x6E -#define AMDGPU_VEGAM_RANGE 0x6E, 0xFF +#define AMDGPU_VEGA10_RANGE 0x01, 0x14 //# 1 <= x < 20 +#define AMDGPU_VEGA12_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_VEGA20_RANGE 0x28, 0xFF //# 40 <= x < max -#define AMDGPU_CARRIZO_RANGE 0x01, 0x21 -#define AMDGPU_STONEY_RANGE 0x61, 0xFF +#define AMDGPU_RAVEN_RANGE 0x01, 0x81 //# 1 <= x < 129 +#define AMDGPU_RAVEN2_RANGE 0x81, 0x90 //# 129 <= x < 144 +#define AMDGPU_RENOIR_RANGE 0x91, 0xFF //# 145 <= x < max -#define AMDGPU_VEGA10_RANGE 0x01, 0x14 -#define AMDGPU_VEGA12_RANGE 0x14, 0x28 -#define AMDGPU_VEGA20_RANGE 0x28, 0x32 -#define AMDGPU_ARCTURUS_RANGE 0x32, 0x3C -#define AMDGPU_ALDEBARAN_RANGE 0x3C, 0xFF +#define AMDGPU_NAVI10_RANGE 0x01, 0x0A //# 1 <= x < 10 +#define AMDGPU_NAVI12_RANGE 0x0A, 0x14 //# 10 <= x < 20 +#define AMDGPU_NAVI14_RANGE 0x14, 0x28 //# 20 <= x < 40 +#define AMDGPU_NAVI21_RANGE 0x28, 0x32 //# 40 <= x < 50 +#define AMDGPU_NAVI22_RANGE 0x32, 0x3C //# 50 <= x < 60 +#define AMDGPU_NAVI23_RANGE 0x3C, 0x46 //# 60 <= x < 70 +#define AMDGPU_NAVI24_RANGE 0x46, 0x50 //# 70 <= x < 80 -#define AMDGPU_RAVEN_RANGE 0x01, 0x81 -#define AMDGPU_RAVEN2_RANGE 0x81, 0x91 -#define AMDGPU_RENOIR_RANGE 0x91, 0xFF +#define AMDGPU_VANGOGH_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_NAVI10_RANGE 0x01, 0x0A -#define AMDGPU_NAVI12_RANGE 0x0A, 0x14 -#define AMDGPU_NAVI14_RANGE 0x14, 0x28 -#define AMDGPU_NAVI21_RANGE 0x28, 0x32 -#define AMDGPU_NAVI22_RANGE 0x32, 0x3C -#define AMDGPU_NAVI23_RANGE 0x3C, 0x46 -#define AMDGPU_NAVI24_RANGE 0x46, 0x50 +#define AMDGPU_NAVI31_RANGE 0x01, 0x10 //# 01 <= x < 16 +#define AMDGPU_NAVI32_RANGE 0x20, 0xFF //# 32 <= x < 255 +#define AMDGPU_NAVI33_RANGE 
0x10, 0x20 //# 16 <= x < 32 +#define AMDGPU_GFX1103_R1_RANGE 0x01, 0x80 //# 1 <= x < 128 +#define AMDGPU_GFX1103_R2_RANGE 0x80, 0xC0 //# 128 <= x < 192 -#define AMDGPU_VANGOGH_RANGE 0x01, 0xFF +#define AMDGPU_GFX1150_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_GFX1100_RANGE 0x01, 0x10 -#define AMDGPU_GFX1101_RANGE 0x20, 0xFF -#define AMDGPU_GFX1102_RANGE 0x10, 0x20 -#define AMDGPU_GFX1103_RANGE 0x01, 0xFF -#define AMDGPU_GFX1150_RANGE 0x01, 0xFF +#define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF //# 01 <= x < 255 -#define AMDGPU_REMBRANDT_RANGE 0x01, 0xFF +#define AMDGPU_RAPHAEL_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_GFX1036_RANGE 0x01, 0xFF +#define AMDGPU_MENDOCINO_RANGE 0x01, 0xFF //# 1 <= x < max -#define AMDGPU_GFX1037_RANGE 0x01, 0xFF +#define AMDGPU_GFX12_TBD1_RANGE 0x40, 0xFF //# 64 <= x < max #define AMDGPU_EXPAND_FIX(x) x #define AMDGPU_RANGE_HELPER(val, min, max) ((val >= min) && (val < max)) @@ -160,6 +144,7 @@ #define ASICREV_IS_VEGAM_P(r) ASICREV_IS(r, VEGAM) #define ASICREV_IS_CARRIZO(r) ASICREV_IS(r, CARRIZO) +#define ASICREV_IS_CARRIZO_BRISTOL(r) ASICREV_IS(r, BRISTOL) #define ASICREV_IS_STONEY(r) ASICREV_IS(r, STONEY) #define ASICREV_IS_VEGA10_M(r) ASICREV_IS(r, VEGA10) @@ -167,8 +152,6 @@ #define ASICREV_IS_VEGA12_P(r) ASICREV_IS(r, VEGA12) #define ASICREV_IS_VEGA12_p(r) ASICREV_IS(r, VEGA12) #define ASICREV_IS_VEGA20_P(r) ASICREV_IS(r, VEGA20) -#define ASICREV_IS_ARCTURUS(r) ASICREV_IS(r, ARCTURUS) -#define ASICREV_IS_ALDEBARAN(r) ASICREV_IS(r, ALDEBARAN) #define ASICREV_IS_RAVEN(r) ASICREV_IS(r, RAVEN) #define ASICREV_IS_RAVEN2(r) ASICREV_IS(r, RAVEN2) @@ -190,16 +173,20 @@ #define ASICREV_IS_VANGOGH(r) ASICREV_IS(r, VANGOGH) -#define ASICREV_IS_GFX1100(r) ASICREV_IS(r, GFX1100) -#define ASICREV_IS_GFX1101(r) ASICREV_IS(r, GFX1101) -#define ASICREV_IS_GFX1102(r) ASICREV_IS(r, GFX1102) -#define ASICREV_IS_GFX1103(r) ASICREV_IS(r, GFX1103) +#define ASICREV_IS_NAVI31_P(r) ASICREV_IS(r, NAVI31) +#define ASICREV_IS_NAVI32_P(r) ASICREV_IS(r, 
NAVI32) +#define ASICREV_IS_NAVI33_P(r) ASICREV_IS(r, NAVI33) +#define ASICREV_IS_GFX1150(r) ASICREV_IS(r, GFX1150) +#define ASICREV_IS_GFX1103_R1(r) ASICREV_IS(r, GFX1103_R1) +#define ASICREV_IS_GFX1103_R2(r) ASICREV_IS(r, GFX1103_R2) #define ASICREV_IS_GFX1150(r) ASICREV_IS(r, GFX1150) #define ASICREV_IS_REMBRANDT(r) ASICREV_IS(r, REMBRANDT) -#define ASICREV_IS_GFX1036(r) ASICREV_IS(r, GFX1036) +#define ASICREV_IS_RAPHAEL(r) ASICREV_IS(r, RAPHAEL) + +#define ASICREV_IS_MENDOCINO(r) ASICREV_IS(r, MENDOCINO) -#define ASICREV_IS_GFX1037(r) ASICREV_IS(r, GFX1037) +#define ASICREV_IS_GFX12_TBD1_P(r) ASICREV_IS(r, GFX12_TBD1) #endif // _AMDGPU_ASIC_ADDR_H diff --git a/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h b/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h index 7383c4e06..9f0521c1f 100644 --- a/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h +++ b/src/image/addrlib/src/chip/gfx10/gfx10_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ diff --git a/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h b/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h index 99a66c08d..12ab84da8 100644 --- a/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h +++ b/src/image/addrlib/src/chip/gfx11/gfx11_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -74,3 +57,4 @@ union GB_ADDR_CONFIG_GFX11 }; #endif + diff --git a/src/image/addrlib/src/chip/gfx12/gfx12_gb_reg.h b/src/image/addrlib/src/chip/gfx12/gfx12_gb_reg.h new file mode 100644 index 000000000..389b3871d --- /dev/null +++ b/src/image/addrlib/src/chip/gfx12/gfx12_gb_reg.h @@ -0,0 +1,57 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2007-2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +#if !defined (__GFX12_GB_REG_H__) +#define __GFX12_GB_REG_H__ + +/* +* gfx12_gb_reg.h +* +* Register Spec Release: 1.0 +* +*/ + +// +// Make sure the necessary endian defines are there. 
+// +#if defined(LITTLEENDIAN_CPU) +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +union GB_ADDR_CONFIG_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_PIPES : 3; + unsigned int PIPE_INTERLEAVE_SIZE : 3; + unsigned int MAX_COMPRESSED_FRAGS : 2; + unsigned int NUM_PKRS : 3; + unsigned int : 8; + unsigned int NUM_SHADER_ENGINES : 4; + unsigned int : 3; + unsigned int NUM_RB_PER_SE : 2; + unsigned int : 4; +#elif defined(BIGENDIAN_CPU) + unsigned int : 4; + unsigned int NUM_RB_PER_SE : 2; + unsigned int : 3; + unsigned int NUM_SHADER_ENGINES : 4; + unsigned int : 8; + unsigned int NUM_PKRS : 3; + unsigned int MAX_COMPRESSED_FRAGS : 2; + unsigned int PIPE_INTERLEAVE_SIZE : 3; + unsigned int NUM_PIPES : 3; +#endif + } bitfields, bits; + unsigned int u32All; + int i32All; + float f32All; +}; + +#endif \ No newline at end of file diff --git a/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h b/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h index b0be682cc..8ff6939ab 100644 --- a/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h +++ b/src/image/addrlib/src/chip/gfx9/gfx9_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ diff --git a/src/image/addrlib/src/chip/r800/si_gb_reg.h b/src/image/addrlib/src/chip/r800/si_gb_reg.h index 3f5f4071e..c5bb578f9 100644 --- a/src/image/addrlib/src/chip/r800/si_gb_reg.h +++ b/src/image/addrlib/src/chip/r800/si_gb_reg.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -93,9 +76,52 @@ #endif +#if defined(LITTLEENDIAN_CPU) + + typedef struct _GB_ADDR_CONFIG_N { + unsigned int num_pipes : 3; + unsigned int pipe_interleave_size : 3; + unsigned int max_compressed_frags : 2; + unsigned int bank_interleave_size : 3; + unsigned int : 1; + unsigned int num_banks : 3; + unsigned int : 1; + unsigned int shader_engine_tile_size : 3; + unsigned int num_shader_engines : 2; + unsigned int num_gpus : 3; + unsigned int multi_gpu_tile_size : 2; + unsigned int num_rb_per_se : 2; + unsigned int row_size : 2; + unsigned int num_lower_pipes : 1; + unsigned int se_enable : 1; + } GB_ADDR_CONFIG_N; + +#elif defined(BIGENDIAN_CPU) + + typedef struct _GB_ADDR_CONFIG_N { + unsigned int se_enable : 1; + unsigned int num_lower_pipes : 1; + unsigned int row_size : 2; + unsigned int num_rb_per_se : 2; + unsigned int multi_gpu_tile_size : 2; + unsigned int num_gpus : 3; + unsigned int num_shader_engines : 2; + unsigned int shader_engine_tile_size : 3; + unsigned int : 1; + unsigned int num_banks : 3; + unsigned int : 1; + unsigned int bank_interleave_size : 3; + unsigned int max_compressed_frags : 2; + unsigned int pipe_interleave_size : 3; + unsigned int num_pipes : 3; + } GB_ADDR_CONFIG_N; + +#endif + typedef union { unsigned int val : 32; GB_ADDR_CONFIG_T f; + GB_ADDR_CONFIG_N n; } GB_ADDR_CONFIG; #if defined(LITTLEENDIAN_CPU) diff --git a/src/image/addrlib/src/core/addrcommon.h b/src/image/addrlib/src/core/addrcommon.h index 6b8fa0a5c..894892574 100644 --- a/src/image/addrlib/src/core/addrcommon.h +++ 
b/src/image/addrlib/src/core/addrcommon.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -42,9 +25,13 @@ #endif #if defined(__GNUC__) + #include #include #endif +#if defined(_WIN32) +#include +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// // Platform specific debug break defines @@ -89,7 +76,13 @@ #else #define ADDR_ASSERT(__e) if ( !((__e) ? 
TRUE : FALSE)) { ADDR_DBG_BREAK(); } #endif - #define ADDR_ASSERT_ALWAYS() ADDR_DBG_BREAK() + + #if ADDR_SILENCE_ASSERT_ALWAYS + #define ADDR_ASSERT_ALWAYS() + #else + #define ADDR_ASSERT_ALWAYS() ADDR_DBG_BREAK() + #endif + #define ADDR_UNHANDLED_CASE() ADDR_ASSERT(!"Unhandled case") #define ADDR_NOT_IMPLEMENTED() ADDR_ASSERT(!"Not implemented"); #else //DEBUG @@ -192,9 +185,11 @@ #endif namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Common constants //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -318,6 +313,49 @@ static inline UINT_32 XorReduce( return result; } +/** +**************************************************************************************************** +* Unset least bit +* +* @brief +* Returns a copy of the value with the least-significant '1' bit unset +**************************************************************************************************** +*/ +static inline UINT_32 UnsetLeastBit( + UINT_32 val) +{ + return val & (val - 1); +} + +/** +**************************************************************************************************** +* BitScanForward +* +* @brief +* Returns the index-position of the least-significant '1' bit. Must not be 0. 
+**************************************************************************************************** +*/ +static inline UINT_32 BitScanForward( + UINT_32 mask) ///< [in] Bitmask to scan +{ + ADDR_ASSERT(mask > 0); + unsigned long out = 0; +#if (defined(_WIN64) && defined(_M_X64)) || (defined(_WIN32) && defined(_M_IX64)) + out = ::_tzcnt_u32(mask); +#elif (defined(_WIN32) || defined(_WIN64)) + ::_BitScanForward(&out, mask); +#elif defined(__GNUC__) + out = __builtin_ctz(mask); +#else + while ((mask & 1) == 0) + { + mask >>= 1; + out++; + } +#endif + return out; +} + /** **************************************************************************************************** * IsPow2 @@ -974,6 +1012,37 @@ static inline UINT_32 GetCoordActiveMask( return mask; } +/** +**************************************************************************************************** +* FillEqBitComponents +* +* @brief +* Fill the 'numBitComponents' field based on the equation. +**************************************************************************************************** +*/ +static inline void FillEqBitComponents( + ADDR_EQUATION *pEquation) // [in/out] Equation to calculate bit components for +{ + pEquation->numBitComponents = 1; // We always have at least the address + for (UINT_32 xorN = 1; xorN < ADDR_MAX_EQUATION_COMP; xorN++) + { + for (UINT_32 bit = 0; bit < ADDR_MAX_EQUATION_BIT; bit++) + { + if (pEquation->comps[xorN][bit].valid) + { + pEquation->numBitComponents = xorN + 1; + break; + } + } + + if (pEquation->numBitComponents != (xorN + 1)) + { + // Skip following components if this one wasn't valid + break; + } + } +} + /** **************************************************************************************************** * ShiftCeil @@ -1005,7 +1074,7 @@ static inline UINT_32 ShiftRight( } } // Addr -} // rocr +} // namespace rocr #endif // __ADDR_COMMON_H__ diff --git a/src/image/addrlib/src/core/addrelemlib.cpp b/src/image/addrlib/src/core/addrelemlib.cpp index 
b3bff74fc..615d8f0e3 100644 --- a/src/image/addrlib/src/core/addrelemlib.cpp +++ b/src/image/addrlib/src/core/addrelemlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,7 +17,8 @@ #include "addrlib.h" namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -1407,6 +1391,10 @@ UINT_32 ElemLib::GetBitsPerPixel( case ADDR_FMT_24_8: bpp = 32; break; + case ADDR_FMT_BG_RG_16_16_16_16: + elemMode = ADDR_PACKED_BGRG; + bpp = 32; + break; case ADDR_FMT_16_16_16_16: case ADDR_FMT_32_32: case ADDR_FMT_CTX1: @@ -1818,6 +1806,7 @@ BOOL_32 ElemLib::IsMacroPixelPacked( { case ADDR_FMT_BG_RG: case ADDR_FMT_GB_GR: + case ADDR_FMT_BG_RG_16_16_16_16: isMacroPixelPacked = TRUE; break; default: @@ -1827,5 +1816,5 @@ BOOL_32 ElemLib::IsMacroPixelPacked( return isMacroPixelPacked; } -} // Addr -} // rocr +} +} //namespace rocr diff --git a/src/image/addrlib/src/core/addrelemlib.h b/src/image/addrlib/src/core/addrelemlib.h index 308c9844b..3352279de 100644 --- a/src/image/addrlib/src/core/addrelemlib.h +++ b/src/image/addrlib/src/core/addrelemlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -39,7 +22,8 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { +namespace Addr +{ class Lib; @@ -273,8 +257,7 @@ class ElemLib : public Object Addr::Lib* const m_pAddrLib; ///< Pointer to parent addrlib instance }; -} // Addr -} // rocr - +} //Addr +} //namespace rocr #endif diff --git a/src/image/addrlib/src/core/addrlib.cpp b/src/image/addrlib/src/core/addrlib.cpp index a958cd11e..d7d322f8d 100644 --- a/src/image/addrlib/src/core/addrlib.cpp +++ b/src/image/addrlib/src/core/addrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -81,7 +64,8 @@ UINT_32 __umoddi3(UINT_64 n, UINT_32 base) #endif // __APPLE__ namespace rocr { -namespace Addr { +namespace Addr +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Constructor/Destructor @@ -228,15 +212,18 @@ ADDR_E_RETURNCODE Lib::Create( case FAMILY_NV: case FAMILY_VGH: case FAMILY_RMB: - case FAMILY_GC_10_3_6: - case FAMILY_GC_10_3_7: + case FAMILY_RPL: + case FAMILY_MDN: pLib = Gfx10HwlInit(&client); break; - case FAMILY_GFX1100: - case FAMILY_GFX1103: + case FAMILY_NV3: case FAMILY_GFX1150: + case FAMILY_GFX1103: pLib = Gfx11HwlInit(&client); break; + case FAMILY_GFX12: + pLib = Gfx12HwlInit(&client); + break; default: ADDR_ASSERT_ALWAYS(); break; @@ -247,7 +234,10 @@ 
ADDR_E_RETURNCODE Lib::Create( break; } } - + if(pLib == NULL) + { + returnCode = ADDR_OUTOFMEMORY; + } if (pLib != NULL) { BOOL_32 initValid; @@ -286,6 +276,7 @@ ADDR_E_RETURNCODE Lib::Create( { delete pLib; pLib = NULL; + returnCode = ADDR_OUTOFMEMORY; ADDR_ASSERT_ALWAYS(); } else @@ -305,12 +296,6 @@ ADDR_E_RETURNCODE Lib::Create( pLib->SetMaxAlignments(); } - else if ((pLib == NULL) && - (returnCode == ADDR_OK)) - { - // Unknown failures, we return the general error code - returnCode = ADDR_ERROR; - } return returnCode; } @@ -673,4 +658,4 @@ UINT_32 Lib::GetBpe(AddrFormat format) const } } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/core/addrlib.h b/src/image/addrlib/src/core/addrlib.h index 0d16762a5..cce002ab1 100644 --- a/src/image/addrlib/src/core/addrlib.h +++ b/src/image/addrlib/src/core/addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,6 +17,7 @@ #define __ADDR_LIB_H__ #include "addrinterface.h" +#include "addrtypes.h" #include "addrobject.h" #include "addrelemlib.h" @@ -56,7 +40,8 @@ #endif namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -266,7 +251,7 @@ class Lib : public Object } /// Returns asic chip family name defined by AddrLib - ChipFamily GetChipFamily() + ChipFamily GetChipFamily() const { return m_chipFamily; } @@ -316,6 +301,21 @@ class Lib : public Object #endif } + static BOOL_32 IsTex1d(AddrResourceType resourceType) + { + return (resourceType == ADDR_RSRC_TEX_1D); + } + + static BOOL_32 IsTex2d(AddrResourceType resourceType) + { + return (resourceType == ADDR_RSRC_TEX_2D); + } + + static BOOL_32 IsTex3d(AddrResourceType resourceType) + { + return (resourceType == ADDR_RSRC_TEX_3D); + } + // // Initialization // @@ -408,7 +408,7 @@ Lib* CiHwlInit (const Client* pClient); Lib* Gfx9HwlInit (const Client* pClient); Lib* Gfx10HwlInit(const Client* pClient); Lib* Gfx11HwlInit(const Client* pClient); +Lib* Gfx12HwlInit(const Client* pClient); } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/core/addrlib1.cpp b/src/image/addrlib/src/core/addrlib1.cpp index c6ab5b3da..c99d0af0a 100644 --- a/src/image/addrlib/src/core/addrlib1.cpp +++ b/src/image/addrlib/src/core/addrlib1.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * 
Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -35,8 +18,10 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Static Const Member @@ -2994,6 +2979,7 @@ ADDR_E_RETURNCODE Lib::ComputeMicroTileEquation( // stackedDepthSlices is used for addressing mode that a tile block contains multiple slices, // which is not supported by our address lib pEquation->stackedDepthSlices = FALSE; + pEquation->numBitComponents = 1; return retCode; } @@ -4070,4 +4056,4 @@ ADDR_E_RETURNCODE Lib::ComputePrtInfo( } // V1 } // Addr -} // namespace rocr +} // namespace rocr \ No newline at 
end of file diff --git a/src/image/addrlib/src/core/addrlib1.h b/src/image/addrlib/src/core/addrlib1.h index a6b7fe32d..8b5dde206 100644 --- a/src/image/addrlib/src/core/addrlib1.h +++ b/src/image/addrlib/src/core/addrlib1.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "addrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** diff --git a/src/image/addrlib/src/core/addrlib2.cpp b/src/image/addrlib/src/core/addrlib2.cpp index e23029100..43d4c0f9d 100644 --- a/src/image/addrlib/src/core/addrlib2.cpp +++ b/src/image/addrlib/src/core/addrlib2.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,8 +19,10 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ //////////////////////////////////////////////////////////////////////////////////////////////////// // Static Const Member @@ -302,6 +287,12 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( if (localIn.flags.needEquation && (Log2(localIn.numFrags) == 0)) { pOut->equationIndex = GetEquationIndex(&localIn, pOut); + if ((localIn.flags.allowExtEquation == 0) && + (pOut->equationIndex != ADDR_INVALID_EQUATION_INDEX) && + (m_equationTable[pOut->equationIndex].numBitComponents > ADDR_MAX_LEGACY_EQUATION_COMP)) + { + pOut->equationIndex = ADDR_INVALID_EQUATION_INDEX; + } } if (localIn.flags.qbStereo) @@ -1177,6 +1168,7 @@ ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear( ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {0}; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {0}; ADDR2_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); localIn.bpp = pIn->bpp; localIn.flags = pIn->flags; @@ -1852,6 +1844,61 @@ ADDR_E_RETURNCODE Lib::Addr2GetPreferredSurfaceSetting( return returnCode; } +/** +************************************************************************************************************************ +* Lib::GetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the client to choose from +* +* @return +* ADDR_E_RETURNCODE 
+************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const +{ + return HwlGetPossibleSwizzleModes(pIn, pOut); +} + +/** +************************************************************************************************************************ +* Lib::GetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const +{ + return HwlGetAllowedBlockSet(allowedSwModeSet, rsrcType, pAllowedBlockSet); +} + +/** +************************************************************************************************************************ +* Lib::GetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const +{ + return HwlGetAllowedSwSet(allowedSwModeSet, pAllowedSwSet); +} + /** ************************************************************************************************************************ * Lib::ComputeBlock256Equation @@ -2000,7 +2047,8 @@ VOID Lib::ComputeQbStereoInfo( VOID Lib::FilterInvalidEqSwizzleMode( ADDR2_SWMODE_SET& allowedSwModeSet, AddrResourceType resourceType, - UINT_32 elemLog2 + UINT_32 elemLog2, + UINT_32 maxComponents ) const { if (resourceType != 
ADDR_RSRC_TEX_1D) @@ -2013,7 +2061,12 @@ VOID Lib::FilterInvalidEqSwizzleMode( { if (validSwModeSet & 1) { - if (m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2] == ADDR_INVALID_EQUATION_INDEX) + UINT_32 equation = m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2]; + if (equation == ADDR_INVALID_EQUATION_INDEX) + { + allowedSwModeSetVal &= ~(1u << swModeIdx); + } + else if (m_equationTable[equation].numBitComponents > maxComponents) { allowedSwModeSetVal &= ~(1u << swModeIdx); } @@ -2030,94 +2083,6 @@ VOID Lib::FilterInvalidEqSwizzleMode( } } -/** -************************************************************************************************************************ -* Lib::IsBlockTypeAvaiable -* -* @brief -* Determine whether a block type is allowed in a given blockSet -* -* @return -* N/A -************************************************************************************************************************ -*/ -BOOL_32 Lib::IsBlockTypeAvaiable( - ADDR2_BLOCK_SET blockSet, - AddrBlockType blockType) -{ - BOOL_32 avail; - - if (blockType == AddrBlockLinear) - { - avail = blockSet.linear ? TRUE : FALSE; - } - else - { - avail = blockSet.value & (1 << (static_cast(blockType) - 1)) ? 
TRUE : FALSE; - } - - return avail; -} - -/** -************************************************************************************************************************ -* Lib::BlockTypeWithinMemoryBudget -* -* @brief -* Determine whether a new block type is acceptible based on memory waste ratio -* -* @return -* N/A -************************************************************************************************************************ -*/ -BOOL_32 Lib::BlockTypeWithinMemoryBudget( - UINT_64 minSize, - UINT_64 newBlockTypeSize, - UINT_32 ratioLow, - UINT_32 ratioHi, - DOUBLE memoryBudget, - BOOL_32 newBlockTypeBigger) -{ - BOOL_32 accept = FALSE; - - if (memoryBudget >= 1.0) - { - if (newBlockTypeBigger) - { - if ((static_cast(newBlockTypeSize) / minSize) <= memoryBudget) - { - accept = TRUE; - } - } - else - { - if ((static_cast(minSize) / newBlockTypeSize) > memoryBudget) - { - accept = TRUE; - } - } - } - else - { - if (newBlockTypeBigger) - { - if ((newBlockTypeSize * ratioHi) <= (minSize * ratioLow)) - { - accept = TRUE; - } - } - else - { - if ((newBlockTypeSize * ratioLow) < (minSize * ratioHi)) - { - accept = TRUE; - } - } - } - - return accept; -} - #if DEBUG /** ************************************************************************************************************************ @@ -2195,4 +2160,5 @@ VOID Lib::ValidateStereoInfo( } // V2 } // Addr -} // rocr +} // namespace rocr + diff --git a/src/image/addrlib/src/core/addrlib2.h b/src/image/addrlib/src/core/addrlib2.h index 118306674..5abf58f03 100644 --- a/src/image/addrlib/src/core/addrlib2.h +++ b/src/image/addrlib/src/core/addrlib2.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "addrlib.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -147,6 +132,8 @@ union ADDR_BIT_SETTING * @brief Swizzle pattern information ************************************************************************************************************************ */ +// Accessed by index representing the logbase2 of (8bpp/16bpp/32bpp/64bpp/128bpp) +// contains the indices which map to 2D arrays SW_PATTERN_NIBBLE[0-9] which contain sections of an index equation. 
They are dependant on pipe# and bpe # struct ADDR_SW_PATINFO { UINT_8 maxItemCount; @@ -305,6 +292,10 @@ class Lib : public Addr::Lib const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + ADDR_E_RETURNCODE GetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + virtual BOOL_32 IsValidDisplaySwizzleMode( const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const { @@ -312,11 +303,21 @@ class Lib : public Addr::Lib return ADDR_NOTIMPLEMENTED; } + ADDR_E_RETURNCODE GetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const; + + ADDR_E_RETURNCODE GetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const; + protected: Lib(); // Constructor is protected Lib(const Client* pClient); static const UINT_32 MaxNumOfBpp = 5; + static const UINT_32 MaxNumOfBppCMask = 4; static const UINT_32 MaxNumOfAA = 4; static const Dim2d Block256_2d[MaxNumOfBpp]; @@ -669,6 +670,31 @@ class Lib : public Addr::Lib return ADDR_NOTSUPPORTED; } + virtual ADDR_E_RETURNCODE HwlGetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + virtual ADDR_E_RETURNCODE HwlGetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTIMPLEMENTED; + } + + virtual ADDR_E_RETURNCODE HwlGetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTIMPLEMENTED; + } + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfoSanityCheck( const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const { @@ -922,17 +948,8 @@ class Lib : public Addr::Lib VOID 
FilterInvalidEqSwizzleMode( ADDR2_SWMODE_SET& allowedSwModeSet, AddrResourceType resourceType, - UINT_32 elemLog2) const; - - static BOOL_32 IsBlockTypeAvaiable(ADDR2_BLOCK_SET blockSet, AddrBlockType blockType); - - static BOOL_32 BlockTypeWithinMemoryBudget( - UINT_64 minSize, - UINT_64 newBlockTypeSize, - UINT_32 ratioLow, - UINT_32 ratioHi, - DOUBLE memoryBudget = 0.0f, - BOOL_32 newBlockTypeBigger = TRUE); + UINT_32 elemLog2, + UINT_32 maxComponents) const; #if DEBUG VOID ValidateStereoInfo( @@ -982,7 +999,6 @@ class Lib : public Addr::Lib } // V2 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/core/addrlib3.cpp b/src/image/addrlib/src/core/addrlib3.cpp new file mode 100644 index 000000000..a2e6c3605 --- /dev/null +++ b/src/image/addrlib/src/core/addrlib3.cpp @@ -0,0 +1,1073 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + + +/** +************************************************************************************************************************ +* @file addrlib3.cpp +* @brief Contains the implementation for the AddrLib3 base class. 
+************************************************************************************************************************ +*/ + +#include "addrinterface.h" +#include "addrlib3.h" +#include "addrcommon.h" + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Static Const Member +//////////////////////////////////////////////////////////////////////////////////////////////////// + +const Dim2d Lib::Block256_2d[] = {{16, 16}, {16, 8}, {8, 8}, {8, 4}, {4, 4}}; + +const ADDR_EXTENT3D Lib::Block1K_3d[] = {{16, 8, 8}, {8, 8, 8}, {8, 8, 4}, {8, 4, 4}, {4, 4, 4}}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Constructor/Destructor +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/** +************************************************************************************************************************ +* Lib::Lib +* +* @brief +* Constructor for the Addr::V3::Lib class +* +************************************************************************************************************************ +*/ +Lib::Lib() + : + Addr::Lib(), + m_pipesLog2(0), + m_pipeInterleaveLog2(0), + m_numEquations(0) +{ + Init(); +} + +/** +************************************************************************************************************************ +* Lib::Lib +* +* @brief +* Constructor for the AddrLib3 class with hClient as parameter +* +************************************************************************************************************************ +*/ +Lib::Lib( + const Client* pClient) + : + Addr::Lib(pClient), + m_pipesLog2(0), + m_pipeInterleaveLog2(0), + m_numEquations(0) +{ + Init(); +} + +/** +************************************************************************************************************************ +* Lib::Init +* +* @brief +* 
Initialization of class +* +************************************************************************************************************************ +*/ +void Lib::Init() +{ + memset(m_equationTable, 0, sizeof(m_equationTable)); + + // There is no equation table entry for linear, so start at the "next" swizzle mode entry. + for (UINT_32 swizzleModeIdx = ADDR3_LINEAR + 1; swizzleModeIdx < ADDR3_MAX_TYPE; swizzleModeIdx++) + { + for (UINT_32 msaaRateIdx = 0; msaaRateIdx < MaxMsaaRateLog2; msaaRateIdx++) + { + for (UINT_32 log2BytesIdx = 0; log2BytesIdx < MaxElementBytesLog2; log2BytesIdx++) + { + SetEquationTableEntry(static_cast(swizzleModeIdx), + msaaRateIdx, + log2BytesIdx, + ADDR_INVALID_EQUATION_INDEX); + } + } + } +} + +/** +************************************************************************************************************************ +* Lib::~Lib +* +* @brief +* Destructor for the AddrLib2 class +* +************************************************************************************************************************ +*/ +Lib::~Lib() +{ +} + +/** +************************************************************************************************************************ +* Lib::GetLib +* +* @brief +* Get Addr::V3::Lib pointer +* +* @return +* An Addr::V2::Lib class pointer +************************************************************************************************************************ +*/ +Lib* Lib::GetLib( + ADDR_HANDLE hLib) ///< [in] handle of ADDR_HANDLE +{ + Addr::Lib* pAddrLib = Addr::Lib::GetLib(hLib); + + return static_cast(hLib); +} + +/** +************************************************************************************************************************ +* Lib::GetBlockSize +* +* @brief +* Returns the byte size of a block for the swizzle mode. +* +* @return +* Byte size of the block, zero if swizzle mode is invalid. 
+************************************************************************************************************************ +*/ +UINT_32 Lib::GetBlockSize( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch + ) const +{ + return (1 << GetBlockSizeLog2(swizzleMode, forPitch)); +} + +/** +************************************************************************************************************************ +* Lib::GetBlockSizeLog2 +* +* @brief +* Returns the log2 of the byte size of a block for the swizzle mode. +* +* @return +* Byte size of the block, zero if swizzle mode is invalid. +************************************************************************************************************************ +*/ +UINT_32 Lib::GetBlockSizeLog2( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch + ) const +{ + UINT_32 blockSize = 0; + + switch (swizzleMode) + { + case ADDR3_256B_2D: + blockSize = 8; + break; + case ADDR3_4KB_2D: + case ADDR3_4KB_3D: + blockSize = 12; + break; + case ADDR3_64KB_2D: + case ADDR3_64KB_3D: + blockSize = 16; + break; + case ADDR3_256KB_2D: + case ADDR3_256KB_3D: + blockSize = 18; + break; + case ADDR3_LINEAR: + blockSize = (forPitch ? 7 : 8); + break; + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + return blockSize; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceInfo +* +* @brief +* Interface function stub of ComputeSurfaceInfo. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((pIn->size != sizeof(ADDR3_COMPUTE_SURFACE_INFO_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + // Adjust incoming parameters. + ADDR3_COMPUTE_SURFACE_INFO_INPUT localIn = *pIn; + localIn.width = Max(pIn->width, 1u); + localIn.height = Max(pIn->height, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSlices = Max(pIn->numSlices, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + UINT_32 expandX = 1; + UINT_32 expandY = 1; + ElemMode elemMode = ADDR_UNCOMPRESSED; + + if (returnCode == ADDR_OK) + { + // Set format to INVALID will skip this conversion + if (localIn.format != ADDR_FMT_INVALID) + { + // Get compression/expansion factors and element mode which indicates compression/expansion + localIn.bpp = GetElemLib()->GetBitsPerPixel(localIn.format, + &elemMode, + &expandX, + &expandY); + + // Special flag for 96 bit surface. 96 (or 48 if we support) bit surface's width is + // pre-multiplied by 3 and bpp is divided by 3. So pitch alignment for linear- + // aligned does not meet 64-pixel in real. We keep special handling in hwl since hw + // restrictions are different. 
+ // Also Mip 1+ needs an element pitch of 32 bits so we do not need this workaround + // but we use this flag to skip RestoreSurfaceInfo below + if ((elemMode == ADDR_EXPANDED) && (expandX > 1)) + { + ADDR_ASSERT(IsLinear(localIn.swizzleMode)); + } + + UINT_32 basePitch = 0; + GetElemLib()->AdjustSurfaceInfo(elemMode, + expandX, + expandY, + &localIn.bpp, + &basePitch, + &localIn.width, + &localIn.height); + + // Overwrite these parameters if we have a valid format + } + + if (localIn.bpp != 0) + { + localIn.width = Max(localIn.width, 1u); + localIn.height = Max(localIn.height, 1u); + } + else // Rule out some invalid parameters + { + returnCode = ADDR_INVALIDPARAMS; + } + } + + if (returnCode == ADDR_OK) + { + returnCode = HwlComputeSurfaceInfo(&localIn, pOut); + + if (returnCode == ADDR_OK) + { + pOut->bpp = localIn.bpp; + pOut->pixelPitch = pOut->pitch; + pOut->pixelHeight = pOut->height; + + if (localIn.format != ADDR_FMT_INVALID) + { + UINT_32 pixelBits = pOut->pixelBits; + + GetElemLib()->RestoreSurfaceInfo(elemMode, + expandX, + expandY, + &pOut->pixelBits, + &pOut->pixelPitch, + &pOut->pixelHeight); + + GetElemLib()->RestoreSurfaceInfo(elemMode, + expandX, + expandY, + &pixelBits, + &pOut->pixelMipChainPitch, + &pOut->pixelMipChainHeight); + + if ((localIn.numMipLevels > 1) && (pOut->pMipInfo != NULL)) + { + for (UINT_32 i = 0; i < localIn.numMipLevels; i++) + { + pOut->pMipInfo[i].pixelPitch = pOut->pMipInfo[i].pitch; + pOut->pMipInfo[i].pixelHeight = pOut->pMipInfo[i].height; + + GetElemLib()->RestoreSurfaceInfo(elemMode, + expandX, + expandY, + &pixelBits, + &pOut->pMipInfo[i].pixelPitch, + &pOut->pMipInfo[i].pixelHeight); + } + } + } + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::GetPossibleSwizzleModes +* +* @brief +* Interface function stub of AddrComputeSurfaceInfo. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::GetPossibleSwizzleModes( + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, ///< [in] input structure + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((pIn->size != sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT)) || + (pOut->size != sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + if (returnCode == ADDR_OK) + { + const ADDR3_SURFACE_FLAGS flags = pIn->flags; + + // VRS images can only be 2D from the client API rules. + ADDR_ASSERT((flags.isVrsImage == 0) || IsTex2d(pIn->resourceType)); + + if (pIn->bpp == 96) + { + pOut->validModes.swLinear = 1; + } + // Depth/Stencil images can't be linear and must be 2D swizzle modes. + // These three are related to DB block that supports only SW_64KB_2D and SW_256KB_2D for DSV. + else if (flags.depth || flags.stencil) + { + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + // The organization of elements in the hierarchical surface is the same as any other surface, and it can support + // any 2D swizzle mode (SW_256_2D, SW_4KB_2D, SW_64KB_2D, or SW_256KB_2D). The swizzle mode can be selected + // orthogonally to the underlying z or stencil surface. + else if (pIn->flags.hiZHiS) + { + pOut->validModes.sw2d256B = 1; + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + // MSAA can't be linear and must be 2D swizzle modes. + else if (pIn->numSamples > 1) + { + // NOTE: SW_256B_2D still supports MSAA. The removal of 256B for MSAA is reverted in HW Doc. 
+ pOut->validModes.sw2d256B = 1; + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + // Block-compressed images need to be either using 2D or linear swizzle modes. + else if (flags.blockCompressed) + { + pOut->validModes.swLinear = 1; + + // We find cases where Tex3d BlockCompressed image adopts 2D_256B should be prohibited. + if (IsTex3d(pIn->resourceType) == FALSE) + { + pOut->validModes.sw2d256B = 1; + } + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + else if (IsTex1d(pIn->resourceType)) + { + pOut->validModes.swLinear = 1; + pOut->validModes.sw2d256B = 1; + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + else if (flags.nv12 || flags.p010 || IsTex2d(pIn->resourceType) || flags.view3dAs2dArray) + { + // NV12 and P010 support + // SW_LINEAR, SW_256B_2D, SW_4KB_2D, SW_64KB_2D, SW_256KB_2D + // There could be more multimedia formats that require more hw specific tiling modes... + + // The exception is VRS images. + // Linear is not allowed and the VRS surface needs to be 8BPP format. + if (flags.isVrsImage) + { + ADDR_ASSERT(pIn->bpp == 8); + } + else + { + pOut->validModes.swLinear = 1; + } + if (flags.view3dAs2dArray == 0) + { + // ADDR3_256B_2D can't support 3D images. + pOut->validModes.sw2d256B = 1; + } + pOut->validModes.sw2d4kB = 1; + pOut->validModes.sw2d64kB = 1; + pOut->validModes.sw2d256kB = 1; + } + else if (IsTex3d(pIn->resourceType)) + { + // An eventual determination would be based on pal setting of height_watermark and depth_watermark. + // However, we just adopt the simpler logic currently. + // For 3D images w/ view3dAs2dArray = 0, SW_3D is preferred. + // For 3D images w/ view3dAs2dArray = 1, it should go to 2D path above. + // Enable linear since client may force linear tiling for 3D texture that does not set view3dAs2dArray. 
+ pOut->validModes.swLinear = 1; + pOut->validModes.sw3d4kB = 1; + pOut->validModes.sw3d64kB = 1; + pOut->validModes.sw3d256kB = 1; + } + } + + constexpr UINT_32 Size256 = 256u; + constexpr UINT_32 Size4K = 4 * 1024; + constexpr UINT_32 Size64K = 64 * 1024; + constexpr UINT_32 Size256K = 256 * 1024; + + ADDR_ASSERT(pIn->maxAlign != 0); + + if (pIn->maxAlign < Size256K) + { + pOut->validModes.value &= ~Gfx12Blk256KBSwModeMask; + } + + if (pIn->maxAlign < Size64K) + { + pOut->validModes.value &= ~Gfx12Blk64KBSwModeMask; + } + + if (pIn->maxAlign < Size4K) + { + pOut->validModes.value &= ~Gfx12Blk4KBSwModeMask; + } + + if (pIn->maxAlign < Size256) + { + pOut->validModes.value &= ~Gfx12Blk256BSwModeMask; + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::HwlConvertChipFamily +* +* @brief +* Convert familyID defined in atiid.h to ChipFamily and set m_chipFamily/m_chipRevision +* @return +* ChipFamily +************************************************************************************************************************ +*/ +ChipFamily Lib::HwlConvertChipFamily( + UINT_32 chipFamily, ///< [in] chip family defined in atiih.h + UINT_32 chipRevision) ///< [in] chip revision defined in "asic_family"_id.h +{ + return ADDR_CHIP_FAMILY_NAVI; +} + +/** +************************************************************************************************************************ +* Lib::ComputeBlockDimensionForSurf +* +* @brief +* Internal function to get block width/height/depth in element from surface input params. 
+* +* @return +* VOID +************************************************************************************************************************ +*/ +VOID Lib::ComputeBlockDimensionForSurf( + ADDR_EXTENT3D* pExtent, + UINT_32 bpp, + UINT_32 numSamples, + Addr3SwizzleMode swizzleMode + ) const +{ + const UINT_32 eleBytes = bpp >> 3; + const UINT_32 log2EleBytes = Log2(eleBytes); + const UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); + + if (IsLinear(swizzleMode)) + { + pExtent->width = 1 << (log2BlkSize - log2EleBytes); + pExtent->height = 1; + pExtent->depth = 1; + } + else if (Is3dSwizzle(swizzleMode)) + { + const UINT_32 base = (log2BlkSize / 3) - (log2EleBytes / 3); + const UINT_32 log2BlkSizeMod3 = log2BlkSize % 3; + const UINT_32 log2EleBytesMod3 = log2EleBytes % 3; + + UINT_32 x = base; + UINT_32 y = base; + UINT_32 z = base; + + if (log2BlkSizeMod3 > 0) + { + x++; + } + + if (log2BlkSizeMod3 > 1) + { + z++; + } + + if (log2EleBytesMod3 > 0) + { + x--; + } + + if (log2EleBytesMod3 > 1) + { + z--; + } + + pExtent->width = 1u << x; + pExtent->height = 1u << y; + pExtent->depth = 1u << z; + } + else + { + const UINT_32 log2Samples = Log2(Max(numSamples, 1u)); + const UINT_32 log2Width = (log2BlkSize >> 1) - + (log2EleBytes >> 1) - + (log2Samples >> 1) - + (log2EleBytes & log2Samples & 1); + const UINT_32 log2Height = (log2BlkSize >> 1) - + (log2EleBytes >> 1) - + (log2Samples >> 1) - + ((log2EleBytes | log2Samples) & 1); + + // Return the extent in actual units, not log2 + pExtent->width = 1u << log2Width; + pExtent->height = 1u << log2Height; + pExtent->depth = 1; + } +} + +/** +************************************************************************************************************************ +* Lib::GetMipTailDim +* +* @brief +* Internal function to get out max dimension of first level in mip tail +* +* @return +* Max Width/Height/Depth value of the first mip fitted in mip tail 
+************************************************************************************************************************ +*/ +ADDR_EXTENT3D Lib::GetMipTailDim( + Addr3SwizzleMode swizzleMode, + const ADDR_EXTENT3D& blockDims + ) const +{ + const UINT_32 log2BlkSize = GetBlockSizeLog2(swizzleMode); + + ADDR_EXTENT3D out = blockDims; + + if (Is3dSwizzle(swizzleMode)) + { + const UINT_32 dim = log2BlkSize % 3; + + if (dim == 0) + { + out.height >>= 1; + } + else if (dim == 1) + { + out.width >>= 1; + } + else + { + out.depth >>= 1; + } + } + else + { + if ((log2BlkSize % 2) == 0) + { + out.width >>= 1; + } + else + { + out.height >>= 1; + } + } + + return out; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoord +* +* @brief +* Interface function stub of ComputeSurfaceAddrFromCoord. +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoord( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (GetFillSizeFieldsFlags() == TRUE) + { + if ((pIn->size != sizeof(ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT))) + { + returnCode = ADDR_PARAMSIZEMISMATCH; + } + } + + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT localIn = *pIn; + localIn.unAlignedDims.width = Max(pIn->unAlignedDims.width, 1u); + localIn.unAlignedDims.height = Max(pIn->unAlignedDims.height, 1u); + localIn.unAlignedDims.depth = Max(pIn->unAlignedDims.depth, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + if ((localIn.bpp < 8) || + (localIn.bpp > 128) || 
+ ((localIn.bpp % 8) != 0) || + (localIn.sample >= localIn.numSamples) || + (localIn.slice >= localIn.unAlignedDims.depth) || + (localIn.mipId >= localIn.numMipLevels) || + (IsTex3d(localIn.resourceType) && + (Valid3DMipSliceIdConstraint(localIn.unAlignedDims.depth, localIn.mipId, localIn.slice) == FALSE))) + { + returnCode = ADDR_INVALIDPARAMS; + } + + if (returnCode == ADDR_OK) + { + if (IsLinear(localIn.swizzleMode)) + { + returnCode = ComputeSurfaceAddrFromCoordLinear(&localIn, pOut); + } + else + { + returnCode = ComputeSurfaceAddrFromCoordTiled(&localIn, pOut); + } + + if (returnCode == ADDR_OK) + { + pOut->prtBlockIndex = static_cast(pOut->addr / (64 * 1024)); + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoord +* +* @brief +* Interface function stub of Addr3ComputePipeBankXor. +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + returnCode = HwlComputePipeBankXor(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoordLinear +* +* @brief +* Internal function to calculate address from coord for linear swizzle surface +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ 
+ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordLinear( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + BOOL_32 valid = (pIn->numSamples <= 1); + + if (valid) + { + if (IsTex1d(pIn->resourceType)) + { + valid = (pIn->y == 0); + } + } + + if (valid) + { + ADDR3_COMPUTE_SURFACE_INFO_INPUT localIn = {0}; + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT localOut = {0}; + ADDR3_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); + + localIn.size = sizeof(localIn); + localIn.flags = pIn->flags; + localIn.swizzleMode = ADDR3_LINEAR; + localIn.resourceType = pIn->resourceType; + localIn.format = ADDR_FMT_INVALID; + localIn.bpp = pIn->bpp; + localIn.width = Max(pIn->unAlignedDims.width, 1u); + localIn.height = Max(pIn->unAlignedDims.height, 1u); + localIn.numSlices = Max(pIn->unAlignedDims.depth, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + if (localIn.numMipLevels <= 1) + { + localIn.pitchInElement = pIn->pitchInElement; + } + + localOut.size = sizeof(localOut); + localOut.pMipInfo = mipInfo; + + returnCode = ComputeSurfaceInfo(&localIn, &localOut); + + if (returnCode == ADDR_OK) + { + pOut->addr = (localOut.sliceSize * pIn->slice) + + mipInfo[pIn->mipId].offset + + (pIn->y * mipInfo[pIn->mipId].pitch + pIn->x) * (pIn->bpp >> 3); + pOut->bitPosition = 0; + } + else + { + valid = FALSE; + } + } + + if (valid == FALSE) + { + returnCode = ADDR_INVALIDPARAMS; + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSurfaceAddrFromCoordTiled +* +* @brief +* Internal function to calculate address from coord for tiled swizzle surface +* +* @return +* ADDR_E_RETURNCODE 
+************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + return HwlComputeSurfaceAddrFromCoordTiled(pIn, pOut); +} + +/** +************************************************************************************************************************ +* Lib::ComputeNonBlockCompressedView +* +* @brief +* Interface function stub of Addr3ComputeNonBlockCompressedView. +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + else if (Is3dSwizzle(pIn->swizzleMode)) + { + // 3D volume images using ADDR3_XX_3D is currently not supported. + returnCode = ADDR_NOTSUPPORTED; + } + else + { + returnCode = HwlComputeNonBlockCompressedView(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Interface function stub of Addr3ComputeSubResourceOffsetForSwizzlePattern. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR2_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT)) || + (pOut->size != sizeof(ADDR2_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + HwlComputeSubResourceOffsetForSwizzlePattern(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::ComputeSlicePipeBankXor +* +* @brief +* Interface function stub of Addr3ComputeSlicePipeBankXor. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ComputeSlicePipeBankXor( + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) +{ + ADDR_E_RETURNCODE returnCode; + + if ((GetFillSizeFieldsFlags() == TRUE) && + ((pIn->size != sizeof(ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT)) || + (pOut->size != sizeof(ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT)))) + { + returnCode = ADDR_INVALIDPARAMS; + } + if ((pIn->bpe != 0) && + (pIn->bpe != 8) && + (pIn->bpe != 16) && + (pIn->bpe != 32) && + (pIn->bpe != 64) && + (pIn->bpe != 128)) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + returnCode = HwlComputeSlicePipeBankXor(pIn, pOut); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Lib::UseCustomHeight +* +* @brief +* Determines if the calculations for this surface should use minimal HW values or user-specified values. +* +* @return +* Returns TRUE if the user-specified alignment should be used +************************************************************************************************************************ +*/ +BOOL_32 Lib::UseCustomHeight( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return ((pIn->numMipLevels <= 1) && + IsLinear(pIn->swizzleMode) && + (pIn->sliceAlign > 0)); +} + +/** +************************************************************************************************************************ +* Lib::UseCustomPitch +* +* @brief +* Determines if the calculations for this surface should use minimal HW values or user-specified values. 
+* +* @return +* Returns TRUE if the user-specified pitch should be used +************************************************************************************************************************ +*/ +BOOL_32 Lib::UseCustomPitch( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return ((pIn->numMipLevels <= 1) && + IsLinear(pIn->swizzleMode) && + (pIn->pitchInElement > 0)); +} + +/** +************************************************************************************************************************ +* Lib::CanTrimLinearPadding +* +* @brief +* Determines if the calculations for this surface can omit extra trailing padding for linear surfaces. +* +* @return +* Returns TRUE if the trailing padding can be omitted. +************************************************************************************************************************ +*/ +BOOL_32 Lib::CanTrimLinearPadding( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return ((IsTex3d(pIn->resourceType) == FALSE) && + (pIn->numSlices <= 1) && + IsLinear(pIn->swizzleMode)); +} + +/** +************************************************************************************************************************ +* Lib::ApplyCustomizedPitchHeight +* +* @brief +* Helper function to override hw required row pitch/slice pitch by customrized one +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Lib::ApplyCustomizedPitchHeight( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + const UINT_32 elementBytes = pIn->bpp >> 3; + + // Calculate the default pitch/height without any user inputs + pOut->pitch = PowTwoAlign(pIn->width, pOut->blockExtent.width); + pOut->height = PowTwoAlign(pIn->height, pOut->blockExtent.height); + + // Custom pitches / alignments 
are only possible with single mip level / linear images; otherwise, + // ignore those parameters. + if (UseCustomPitch(pIn)) + { + const UINT_32 pitchAlignmentBytes = 1 << GetBlockSizeLog2(pIn->swizzleMode, TRUE); + const UINT_32 pitchAlignmentElements = pitchAlignmentBytes / elementBytes; + + // Their requested pitch has to meet the pitch alignment constraints applied by the HW. + if ((pIn->pitchInElement % pitchAlignmentElements) != 0) + { + returnCode = ADDR_INVALIDPARAMS; + } + // And their pitch can't be less than the minimum + else if (pIn->pitchInElement < pOut->pitch) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + pOut->pitch = pIn->pitchInElement; + } + } + + if ((returnCode == ADDR_OK) && UseCustomHeight(pIn)) + { + UINT_32 customizedHeight = pIn->sliceAlign / elementBytes / pOut->pitch; + + if (customizedHeight * elementBytes * pOut->pitch != pIn->sliceAlign) + { + returnCode = ADDR_INVALIDPARAMS; + } + else if ((pIn->numSlices > 1) && (pOut->height != customizedHeight)) + { + returnCode = ADDR_INVALIDPARAMS; + } + else + { + pOut->height = customizedHeight; + } + } + + return returnCode; +} + +} // V3 +} // Addr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/core/addrlib3.h b/src/image/addrlib/src/core/addrlib3.h new file mode 100644 index 000000000..6b7a1818b --- /dev/null +++ b/src/image/addrlib/src/core/addrlib3.h @@ -0,0 +1,417 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + + +/** +************************************************************************************************************************ +* @file addrlib3.h +* @brief Contains the Addr::V3::Lib class definition. 
+************************************************************************************************************************ +*/ + +#ifndef __ADDR3_LIB3_H__ +#define __ADDR3_LIB3_H__ + +#include "addrlib.h" + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + +/** +************************************************************************************************************************ +* @brief Bitmasks for swizzle mode determination on GFX12 +************************************************************************************************************************ +*/ +const UINT_32 Gfx12Blk256KBSwModeMask = (1u << ADDR3_256KB_2D) | + (1u << ADDR3_256KB_3D); + +const UINT_32 Gfx12Blk64KBSwModeMask = (1u << ADDR3_64KB_2D) | + (1u << ADDR3_64KB_3D); + +const UINT_32 Gfx12Blk4KBSwModeMask = (1u << ADDR3_4KB_2D) | + (1u << ADDR3_4KB_3D); + +const UINT_32 Gfx12Blk256BSwModeMask = (1u << ADDR3_256B_2D); + +/** +************************************************************************************************************************ +* @brief Bit setting for swizzle pattern +************************************************************************************************************************ +*/ +union ADDR_BIT_SETTING +{ + struct + { + UINT_16 x; + UINT_16 y; + UINT_16 z; + UINT_16 s; + }; + UINT_64 value; +}; + +/** +************************************************************************************************************************ +* @brief Flags for SwizzleModeTable +************************************************************************************************************************ +*/ +union SwizzleModeFlags +{ + struct + { + // Swizzle mode + UINT_32 isLinear : 1; // Linear + UINT_32 is2d : 1; // 2d mode + UINT_32 is3d : 1; // 3d mode + + // Block size + UINT_32 is256b : 1; // Block size is 256B + UINT_32 is4kb : 1; // Block size is 4KB + UINT_32 is64kb : 1; // Block size is 64KB + UINT_32 is256kb : 1; // Block size is 256KB + + UINT_32 reserved : 25; // 
Reserved bits + }; + + UINT_32 u32All; +}; + +struct Dim2d +{ + UINT_32 w; + UINT_32 h; +}; + +const UINT_32 Log2Size256 = 8u; +const UINT_32 Log2Size4K = 12u; +const UINT_32 Log2Size64K = 16u; +const UINT_32 Log2Size256K = 18u; + +/** +************************************************************************************************************************ +* @brief Swizzle pattern information +************************************************************************************************************************ +*/ +// Accessed by index representing the logbase2 of (8bpp/16bpp/32bpp/64bpp/128bpp) +// contains the indices which map to 2D arrays SW_PATTERN_NIBBLE[1-4] which contain sections of an index equation. +struct ADDR_SW_PATINFO +{ + UINT_8 nibble1Idx; + UINT_8 nibble2Idx; + UINT_8 nibble3Idx; + UINT_8 nibble4Idx; +}; + +/** +************************************************************************************************************************ +* InitBit +* +* @brief +* Initialize bit setting value via a return value +************************************************************************************************************************ +*/ +#define InitBit(c, index) (1ull << ((c << 4) + index)) + +const UINT_64 X0 = InitBit(0, 0); +const UINT_64 X1 = InitBit(0, 1); +const UINT_64 X2 = InitBit(0, 2); +const UINT_64 X3 = InitBit(0, 3); +const UINT_64 X4 = InitBit(0, 4); +const UINT_64 X5 = InitBit(0, 5); +const UINT_64 X6 = InitBit(0, 6); +const UINT_64 X7 = InitBit(0, 7); +const UINT_64 X8 = InitBit(0, 8); + +const UINT_64 Y0 = InitBit(1, 0); +const UINT_64 Y1 = InitBit(1, 1); +const UINT_64 Y2 = InitBit(1, 2); +const UINT_64 Y3 = InitBit(1, 3); +const UINT_64 Y4 = InitBit(1, 4); +const UINT_64 Y5 = InitBit(1, 5); +const UINT_64 Y6 = InitBit(1, 6); +const UINT_64 Y7 = InitBit(1, 7); +const UINT_64 Y8 = InitBit(1, 8); + +const UINT_64 Z0 = InitBit(2, 0); +const UINT_64 Z1 = InitBit(2, 1); +const UINT_64 Z2 = InitBit(2, 2); +const UINT_64 Z3 = InitBit(2, 3); 
+const UINT_64 Z4 = InitBit(2, 4); +const UINT_64 Z5 = InitBit(2, 5); + +const UINT_64 S0 = InitBit(3, 0); +const UINT_64 S1 = InitBit(3, 1); +const UINT_64 S2 = InitBit(3, 2); + +/** +************************************************************************************************************************ +* @brief Bit setting for swizzle pattern +************************************************************************************************************************ +*/ + +/** +************************************************************************************************************************ +* @brief This class contains asic independent address lib functionalities +************************************************************************************************************************ +*/ +class Lib : public Addr::Lib +{ +public: + virtual ~Lib(); + + static Lib* GetLib( + ADDR_HANDLE hLib); + + // + // Interface stubs + // + + // For data surface + ADDR_E_RETURNCODE ComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + ADDR_E_RETURNCODE GetPossibleSwizzleModes( + const ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT* pIn, + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT* pOut) const; + + ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoord( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + // Misc + ADDR_E_RETURNCODE ComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut); + + ADDR_E_RETURNCODE ComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut); + + ADDR_E_RETURNCODE ComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut); + + ADDR_E_RETURNCODE ComputeSlicePipeBankXor( + const 
ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut); + +protected: + Lib(); // Constructor is protected + Lib(const Client* pClient); + + static const UINT_32 MaxImageDim = 65536; + static const UINT_32 MaxMipLevels = 17; // Max image size is 64k + static const UINT_32 MaxNumOfBpp = 5; + static const UINT_32 MaxNumOfAA = 4; + UINT_32 m_pipesLog2; ///< Number of pipe per shader engine Log2 + UINT_32 m_pipeInterleaveLog2; ///< Log2 of pipe interleave bytes + + static const Dim2d Block256_2d[MaxNumOfBpp]; + static const ADDR_EXTENT3D Block1K_3d[MaxNumOfBpp]; + SwizzleModeFlags m_swizzleModeTable[ADDR3_MAX_TYPE]; ///< Swizzle mode table + + // Number of unique MSAA sample rates (1/2/4/8) + static const UINT_32 MaxMsaaRateLog2 = 4; + // Max number of bpp (8bpp/16bpp/32bpp/64bpp/128bpp) + static const UINT_32 MaxElementBytesLog2 = 5; + // Number of unique swizzle patterns (one entry per swizzle mode + MSAA + bpp configuration) + static const UINT_32 NumSwizzlePatterns = 19 * MaxElementBytesLog2; + + // Number of equation entries in the table + UINT_32 m_numEquations; + // Equation lookup table according to swizzle mode, MSAA sample rate, and bpp + UINT_32 m_equationLookupTable[ADDR3_MAX_TYPE - 1][MaxMsaaRateLog2][MaxElementBytesLog2]; + + // Equation table + ADDR_EQUATION m_equationTable[NumSwizzlePatterns]; + + void SetEquationTableEntry( + Addr3SwizzleMode addrType, + UINT_32 msaaLog2, + UINT_32 elementLog2, + UINT_32 value) + { + m_equationLookupTable[addrType - 1][msaaLog2][elementLog2] = value; + } + + const UINT_32 GetEquationTableEntry( + Addr3SwizzleMode addrType, + UINT_32 msaaLog2, + UINT_32 elementLog2) const + { + return m_equationLookupTable[addrType - 1][msaaLog2][elementLog2]; + } + + static BOOL_32 Valid3DMipSliceIdConstraint( + UINT_32 numSlices, + UINT_32 mipId, + UINT_32 slice) + { + return (Max((numSlices >> mipId), 1u) > slice); + } + + UINT_32 GetBlockSize( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch = 
FALSE) const; + + UINT_32 GetBlockSizeLog2( + Addr3SwizzleMode swizzleMode, + BOOL_32 forPitch = FALSE) const; + + BOOL_32 IsValidSwMode(Addr3SwizzleMode swizzleMode) const + { + return (m_swizzleModeTable[swizzleMode].u32All != 0); + } + + UINT_32 IsLinear(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].isLinear; + } + + // Checking block size + BOOL_32 IsBlock256b(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is256b; + } + + // Checking block size + BOOL_32 IsBlock4kb(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is4kb; + } + + // Checking block size + BOOL_32 IsBlock64kb(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is64kb; + } + + // Checking block size + BOOL_32 IsBlock256kb(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is256kb; + } + + BOOL_32 Is2dSwizzle(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is2d; + } + + BOOL_32 Is3dSwizzle(Addr3SwizzleMode swizzleMode) const + { + return m_swizzleModeTable[swizzleMode].is3d; + } + + virtual UINT_32 HwlComputeMaxBaseAlignments() const { return 256 * 1024; } + + virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) + { + ADDR_NOT_IMPLEMENTED(); + // Although GFX12 addressing should be consistent regardless of the configuration, we still need to + // call some initialization for member variables. 
+ return TRUE; + } + + virtual ChipFamily HwlConvertChipFamily( + UINT_32 chipFamily, + UINT_32 chipRevision); + + virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const { return 0; } + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + virtual ADDR_E_RETURNCODE HwlComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + VOID ComputeBlockDimensionForSurf( + ADDR_EXTENT3D* pExtent, + UINT_32 bpp, + UINT_32 numSamples, + Addr3SwizzleMode swizzleMode) const; + + ADDR_EXTENT3D GetMipTailDim( + Addr3SwizzleMode swizzleMode, + const ADDR_EXTENT3D& blockDims) const; + + ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoordLinear( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTIMPLEMENTED; + } + + virtual ADDR_E_RETURNCODE HwlComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + virtual VOID HwlComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + } + + virtual ADDR_E_RETURNCODE HwlComputeSlicePipeBankXor( + const 
ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) const + { + ADDR_NOT_IMPLEMENTED(); + return ADDR_NOTSUPPORTED; + } + + ADDR_E_RETURNCODE ApplyCustomizedPitchHeight( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + BOOL_32 UseCustomHeight(const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + BOOL_32 UseCustomPitch(const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + BOOL_32 CanTrimLinearPadding(const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + +private: + // Disallow the copy constructor + Lib(const Lib& a); + + // Disallow the assignment operator + Lib& operator=(const Lib& a); + + void Init(); +}; + +} // V3 +} // Addr +} // namespace rocr + +#endif \ No newline at end of file diff --git a/src/image/addrlib/src/core/addrobject.cpp b/src/image/addrlib/src/core/addrobject.cpp index 2a08b0ae0..f3d3fff27 100644 --- a/src/image/addrlib/src/core/addrobject.cpp +++ b/src/image/addrlib/src/core/addrobject.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -35,7 +18,8 @@ #include "addrobject.h" namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -237,4 +221,4 @@ VOID Object::DebugPrint( } } // Addr -} // rocr +} // namespace rocr diff --git a/src/image/addrlib/src/core/addrobject.h b/src/image/addrlib/src/core/addrobject.h index 0d270789a..57205e5b2 100644 --- a/src/image/addrlib/src/core/addrobject.h +++ b/src/image/addrlib/src/core/addrobject.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,7 +21,8 @@ #include "addrcommon.h" namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -91,6 +75,5 @@ class Object }; } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/core/coord.cpp b/src/image/addrlib/src/core/coord.cpp index f371458f4..74644e75e 100644 --- a/src/image/addrlib/src/core/coord.cpp +++ b/src/image/addrlib/src/core/coord.cpp @@ -3,24 +3,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -29,8 +12,10 @@ #include "coord.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ Coordinate::Coordinate() { @@ -600,4 +585,4 @@ BOOL_32 CoordEq::operator!=(const CoordEq& b) } // V2 } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/core/coord.h b/src/image/addrlib/src/core/coord.h index 490823f3f..95c4fca68 100644 --- a/src/image/addrlib/src/core/coord.h +++ b/src/image/addrlib/src/core/coord.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -29,8 +12,10 @@ #define __COORD_H namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ #if defined(__cplusplus) #if defined(_MSC_VER) #if _MSC_VER >= 1900 @@ -140,7 +125,6 @@ class CoordEq } // V2 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h b/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h index 3a783bb4b..f476b3984 100644 --- a/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h +++ b/src/image/addrlib/src/gfx10/gfx10SwizzlePattern.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,9 +17,10 @@ #define __GFX10_SWIZZLE_PATTERN_H__ namespace rocr { -namespace Addr { -namespace V2 { - +namespace Addr +{ +namespace V2 +{ const ADDR_SW_PATINFO GFX10_SW_256_S_PATINFO[] = { { 1, 0, 0, 0, 0, } , // 1 pipes 1 bpe @ SW_256_S @ Navi1x @@ -6031,7 +6015,8 @@ const UINT_64 GFX10_CMASK_SW_PATTERN[][17] = {X3, Y3, X7, Y7, X8, Y8, X9, Y9, X10, Y4^X9^Y9, Z3^X4^Y4, Z2^Y5^X8, Z1^X5^Y8, Y6^X7, Z0^X6^Y7, 0, 0, }, //34 }; -} // V2 +}// V2 } // Addr -} // rocr +} // namespace rocr + #endif diff --git a/src/image/addrlib/src/gfx10/gfx10addrlib.cpp b/src/image/addrlib/src/gfx10/gfx10addrlib.cpp index 733252f88..324697b73 100644 --- a/src/image/addrlib/src/gfx10/gfx10addrlib.cpp +++ b/src/image/addrlib/src/gfx10/gfx10addrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -31,15 +14,16 @@ */ #include "gfx10addrlib.h" +#include "addrcommon.h" #include "gfx10_gb_reg.h" #include "amdgpu_asic_addr.h" //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - namespace rocr { -namespace Addr { +namespace Addr +{ /** ************************************************************************************************************************ * Gfx10HwlInit @@ -93,7 +77,7 @@ const SwizzleModeFlags Gfx10Lib::SwizzleModeTable[ADDR_SW_MAX_TYPE] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved {{0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // 
ADDR_SW_4KB_S_X {{0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR_SW_4KB_D_X - {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0}}, // ADDR_SW_4KB_R_X {{0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_Z_X {{0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_S_X @@ -128,6 +112,7 @@ Gfx10Lib::Gfx10Lib(const Client* pClient) m_numSaLog2(0), m_colorBaseIndex(0), m_xmaskBaseIndex(0), + m_htileBaseIndex(0), m_dccBaseIndex(0) { memset(&m_settings, 0, sizeof(m_settings)); @@ -675,7 +660,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeHtileAddrFromCoord( { const UINT_32 numSampleLog2 = Log2(pIn->numSamples); const UINT_32 pipeMask = (1 << m_pipesLog2) - 1; - const UINT_32 index = m_xmaskBaseIndex + numSampleLog2; + const UINT_32 index = m_htileBaseIndex + numSampleLog2; const UINT_8* patIdxTable = m_settings.supportRbPlus ? GFX10_HTILE_RBPLUS_PATIDX : GFX10_HTILE_PATIDX; const UINT_32 blkSizeLog2 = Log2(output.metaBlkWidth) + Log2(output.metaBlkHeight) - 4; @@ -948,9 +933,11 @@ BOOL_32 Gfx10Lib::HwlInitGlobalParams( { // Skip unaligned case - m_xmaskBaseIndex += MaxNumOfAA; + m_xmaskBaseIndex += MaxNumOfBppCMask; + m_htileBaseIndex += MaxNumOfAA; - m_xmaskBaseIndex += m_pipesLog2 * MaxNumOfAA; + m_xmaskBaseIndex += m_pipesLog2 * MaxNumOfBppCMask; + m_htileBaseIndex += m_pipesLog2 * MaxNumOfAA; m_colorBaseIndex += m_pipesLog2 * MaxNumOfBpp; if (m_settings.supportRbPlus) @@ -966,7 +953,8 @@ BOOL_32 Gfx10Lib::HwlInitGlobalParams( if (m_numPkrLog2 >= 2) { m_colorBaseIndex += (2 * m_numPkrLog2 - 2) * MaxNumOfBpp; - m_xmaskBaseIndex += (m_numPkrLog2 - 1) * 3 * MaxNumOfAA; + m_xmaskBaseIndex += (m_numPkrLog2 - 1) * 3 * MaxNumOfBppCMask; + m_htileBaseIndex += (m_numPkrLog2 - 1) * 3 * MaxNumOfAA; } } else @@ -976,9 +964,8 @@ BOOL_32 Gfx10Lib::HwlInitGlobalParams( 1; ADDR_C_ASSERT(sizeof(GFX10_HTILE_PATIDX) / sizeof(GFX10_HTILE_PATIDX[0]) == (numPipeType + 1) * MaxNumOfAA); - - ADDR_C_ASSERT(sizeof(GFX10_HTILE_PATIDX) / 
sizeof(GFX10_HTILE_PATIDX[0]) == - sizeof(GFX10_CMASK_64K_PATIDX) / sizeof(GFX10_CMASK_64K_PATIDX[0])); + ADDR_C_ASSERT(sizeof(GFX10_CMASK_64K_PATIDX) / sizeof(GFX10_CMASK_64K_PATIDX[0]) == + (numPipeType + 1) * MaxNumOfBppCMask); } } @@ -1071,7 +1058,6 @@ ChipFamily Gfx10Lib::HwlConvertChipFamily( ADDR_ASSERT(!"Unknown chip revision"); } break; - case FAMILY_RMB: if (ASICREV_IS_REMBRANDT(chipRevision)) { @@ -1083,15 +1069,15 @@ ChipFamily Gfx10Lib::HwlConvertChipFamily( ADDR_ASSERT(!"Unknown chip revision"); } break; - case FAMILY_GC_10_3_6: - if (ASICREV_IS_GFX1036(chipRevision)) + case FAMILY_RPL: + if (ASICREV_IS_RAPHAEL(chipRevision)) { m_settings.supportRbPlus = 1; m_settings.dccUnsup3DSwDis = 0; } break; - case FAMILY_GC_10_3_7: - if (ASICREV_IS_GFX1037(chipRevision)) + case FAMILY_MDN: + if (ASICREV_IS_MENDOCINO(chipRevision)) { m_settings.supportRbPlus = 1; m_settings.dccUnsup3DSwDis = 0; @@ -1460,13 +1446,15 @@ VOID Gfx10Lib::ConvertSwizzlePatternToEquation( ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern const { - ADDR_BIT_SETTING fullSwizzlePattern[20]; + // Get full swizzle pattern and store it as an ADDR_BIT_SETTING list + ADDR_BIT_SETTING fullSwizzlePattern[ADDR_MAX_EQUATION_BIT]; GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); const ADDR_BIT_SETTING* pSwizzle = fullSwizzlePattern; const UINT_32 blockSizeLog2 = GetBlockSizeLog2(swMode); - + memset(pEquation, 0, sizeof(ADDR_EQUATION)); pEquation->numBits = blockSizeLog2; + pEquation->numBitComponents = pPatInfo->maxItemCount; pEquation->stackedDepthSlices = FALSE; for (UINT_32 i = 0; i < elemLog2; i++) @@ -1994,37 +1982,45 @@ VOID Gfx10Lib::InitEquationTable() { memset(m_equationTable, 0, sizeof(m_equationTable)); + // Iterate through resourceTypes, up to MaxRsrcType where a "resourceType" refers to AddrResourceType (1D/2D/3D) + // resources. 
This starts with rsrcTypeIdx = 0, however there is an offset added that will start us off at + // computing 2D resources. for (UINT_32 rsrcTypeIdx = 0; rsrcTypeIdx < MaxRsrcType; rsrcTypeIdx++) { + // Add offset. Start iterating from ADDR_RSRC_TEX_2D const AddrResourceType rsrcType = static_cast(rsrcTypeIdx + ADDR_RSRC_TEX_2D); + // Iterate through the maximum number of swizzlemodes a type can hold for (UINT_32 swModeIdx = 0; swModeIdx < MaxSwModeType; swModeIdx++) { const AddrSwizzleMode swMode = static_cast(swModeIdx); + // Iterate through the different bits-per-pixel settings (8bpp/16bpp/32bpp/64bpp/128bpp) for (UINT_32 elemLog2 = 0; elemLog2 < MaxElementBytesLog2; elemLog2++) { UINT_32 equationIndex = ADDR_INVALID_EQUATION_INDEX; + // May or may not return a ADDR_SW_PATINFO for a completely different swizzle mode, essentially + // overwriting the choice. const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(swMode, rsrcType, elemLog2, 1); if (pPatInfo != NULL) { ADDR_ASSERT(IsValidSwMode(swMode)); - - if (pPatInfo->maxItemCount <= 3) + if (pPatInfo->maxItemCount <= 3) // Get a valid equationIndex { ADDR_EQUATION equation = {}; + // Passing in pPatInfo to get the addr equation ConvertSwizzlePatternToEquation(elemLog2, rsrcType, swMode, pPatInfo, &equation); equationIndex = m_numEquations; ADDR_ASSERT(equationIndex < EquationTableSize); - + // Updates m_equationTable[m_numEquations] to be the addr equation for this PatInfo m_equationTable[equationIndex] = equation; - + // Increment m_numEquations m_numEquations++; } - else + else // There is no equationIndex { // We only see "ill" equation from 64/128 BPE + 3D resource + SW_64KB_D_X under RB+ case ADDR_ASSERT((elemLog2 == 3) || (elemLog2 == 4)); @@ -2033,7 +2029,8 @@ VOID Gfx10Lib::InitEquationTable() ADDR_ASSERT(m_settings.supportRbPlus == 1); } } - + // equationIndex, which is used to look up equations in m_equationTable, will be cached for every + // iteration in this nested for-loop 
m_equationLookupTable[rsrcTypeIdx][swModeIdx][elemLog2] = equationIndex; } } @@ -2318,9 +2315,9 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeNonBlockCompressedView( { ADDR_E_RETURNCODE returnCode = ADDR_OK; - if (pIn->resourceType != ADDR_RSRC_TEX_2D) + if (IsThin(pIn->resourceType, pIn->swizzleMode) == FALSE) { - // Only 2D resource can have a NonBC view... + // Only thin swizzle mode can have a NonBC view... returnCode = ADDR_INVALIDPARAMS; } else if (((pIn->format < ADDR_FMT_ASTC_4x4) || (pIn->format > ADDR_FMT_ETC2_128BPP)) && @@ -2347,6 +2344,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeNonBlockCompressedView( infoIn.numFrags = 1; ADDR2_MIP_INFO mipInfo[MaxMipLevels] = {}; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); ADDR2_COMPUTE_SURFACE_INFO_OUTPUT infoOut = {}; infoOut.pMipInfo = mipInfo; @@ -2597,6 +2595,7 @@ BOOL_32 Gfx10Lib::ValidateSwModeParams( const BOOL_32 linear = IsLinear(swizzle); const BOOL_32 blk256B = IsBlock256b(swizzle); const BOOL_32 blkVar = IsBlockVariable(swizzle); + const BOOL_32 isNonPrtXor = IsNonPrtXor(swizzle); const BOOL_32 prt = flags.prt; const BOOL_32 fmask = flags.fmask; @@ -2652,7 +2651,7 @@ BOOL_32 Gfx10Lib::ValidateSwModeParams( { if (((swizzleMask & Gfx10Rsrc3dSwModeMask) == 0) || (prt && ((swizzleMask & Gfx10Rsrc3dPrtSwModeMask) == 0)) || - (thin3d && ((swizzleMask & Gfx10Rsrc3dThinSwModeMask) == 0))) + (thin3d && ((swizzleMask & Gfx10Rsrc3dViewAs2dSwModeMask) == 0))) { ADDR_ASSERT_ALWAYS(); valid = FALSE; @@ -2755,7 +2754,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceInfoSanityCheck( * Gfx10Lib::HwlGetPreferredSurfaceSetting * * @brief -* Internal function to get suggested surface information for cliet to use +* Internal function to get suggested surface information for client to use * * @return * ADDR_E_RETURNCODE @@ -2824,7 +2823,7 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( padSize[i] = PowTwoAlign(padSize[i], sizeAlignInElement); } - if (BlockTypeWithinMemoryBudget(padSize[0], + if 
(Addr2BlockTypeWithinMemoryBudget(padSize[0], padSize[1], ratioLow, ratioHi, @@ -2969,7 +2968,8 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( if (pIn->flags.view3dAs2dArray) { - allowedSwModeSet.value &= Gfx10Rsrc3dThinSwModeMask; + // SW_LINEAR can be used for 3D thin images, including BCn image format. + allowedSwModeSet.value &= Gfx10Rsrc3dViewAs2dSwModeMask; } break; @@ -3057,7 +3057,9 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( if (pIn->flags.needEquation) { - FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); } if (allowedSwModeSet.value == Gfx10LinearSwModeMask) @@ -3076,11 +3078,13 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( allowedSwModeSet.swLinear = 0; } + // A bitfield where each bit represents a block type. Each swizzle mode maps to a block. ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); // Determine block size if there are 2 or more block type candidates if (IsPow2(allowedBlockSet.value) == FALSE) { + // Tracks a valid SwizzleMode for each valid block type AddrSwizzleMode swMode[AddrBlockMaxTiledType] = {}; swMode[AddrBlockLinear] = ADDR_SW_LINEAR; @@ -3103,18 +3107,21 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( swMode[AddrBlockThin64KB] = ADDR_SW_64KB_S; } + // Tracks the size of each valid swizzle mode's surface in bytes UINT_64 padSize[AddrBlockMaxTiledType] = {}; const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); - UINT_32 minSizeBlk = AddrBlockMicro; - UINT_64 minSize = 0; + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); + UINT_32 minSizeBlk = AddrBlockMicro; // Tracks the most optimal block to use + UINT_64 minSize = 0; // Tracks the minimum acceptable block type ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; + // Iterate through all block types for (UINT_32 i = AddrBlockLinear; i < AddrBlockMaxTiledType; i++) { - if (IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + if (Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { localIn.swizzleMode = swMode[i]; @@ -3138,7 +3145,8 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( } else { - if (BlockTypeWithinMemoryBudget( + // Checks if the block type is within the memory budget but favors larger blocks + if (Addr2BlockTypeWithinMemoryBudget( minSize, padSize[i], ratioLow, @@ -3187,9 +3195,9 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) { if ((i != minSizeBlk) && - IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { - if (BlockTypeWithinMemoryBudget( + if (Addr2BlockTypeWithinMemoryBudget( minSize, padSize[i], 0, @@ -3679,6 +3687,7 @@ ADDR_E_RETURNCODE Gfx10Lib::ComputeSurfaceInfoMacroTiled( UINT_64 mipSize[MaxMipLevels]; UINT_64 mipSliceSize[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); Dim3d fixedTailMaxDim = tailMaxDim; if (m_settings.dsMipmapHtileFix && IsZOrderSwizzle(pIn->swizzleMode) && (index <= 1)) @@ -3895,54 +3904,23 @@ UINT_32 Gfx10Lib::ComputeOffsetFromEquation( { UINT_32 v = 0; - if (pEq->addr[i].valid) + for (UINT_32 c = 0; c < pEq->numBitComponents; c++) { - if (pEq->addr[i].channel == 0) + if (pEq->comps[c][i].valid) { - v ^= (x >> pEq->addr[i].index) & 1; - } - else if (pEq->addr[i].channel == 1) - { - v ^= (y >> pEq->addr[i].index) & 1; - } - else - { - 
ADDR_ASSERT(pEq->addr[i].channel == 2); - v ^= (z >> pEq->addr[i].index) & 1; - } - } - - if (pEq->xor1[i].valid) - { - if (pEq->xor1[i].channel == 0) - { - v ^= (x >> pEq->xor1[i].index) & 1; - } - else if (pEq->xor1[i].channel == 1) - { - v ^= (y >> pEq->xor1[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->xor1[i].channel == 2); - v ^= (z >> pEq->xor1[i].index) & 1; - } - } - - if (pEq->xor2[i].valid) - { - if (pEq->xor2[i].channel == 0) - { - v ^= (x >> pEq->xor2[i].index) & 1; - } - else if (pEq->xor2[i].channel == 1) - { - v ^= (y >> pEq->xor2[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->xor2[i].channel == 2); - v ^= (z >> pEq->xor2[i].index) & 1; + if (pEq->comps[c][i].channel == 0) + { + v ^= (x >> pEq->comps[c][i].index) & 1; + } + else if (pEq->comps[c][i].channel == 1) + { + v ^= (y >> pEq->comps[c][i].index) & 1; + } + else + { + ADDR_ASSERT(pEq->comps[c][i].channel == 2); + v ^= (z >> pEq->comps[c][i].index) & 1; + } } } @@ -4071,6 +4049,8 @@ const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( UINT_32 numFrag ///< Number of fragment ) const { + // Now elemLog2 is going to be used to access the correct index insode of the pPatInfo array so we will start from + // the right location const UINT_32 index = IsXor(swizzleMode) ? (m_colorBaseIndex + elemLog2) : elemLog2; const ADDR_SW_PATINFO* patInfo = NULL; const UINT_32 swizzleMask = 1 << swizzleMode; @@ -4133,8 +4113,15 @@ const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( { if (IsRtOptSwizzle(swizzleMode)) { - patInfo = m_settings.supportRbPlus ? - GFX10_SW_64K_R_X_1xaa_RBPLUS_PATINFO : GFX10_SW_64K_R_X_1xaa_PATINFO; + if (swizzleMode == ADDR_SW_4KB_R_X) + { + patInfo = NULL; + } + else + { + patInfo = m_settings.supportRbPlus ? + GFX10_SW_64K_R_X_1xaa_RBPLUS_PATINFO : GFX10_SW_64K_R_X_1xaa_PATINFO; + } } else if (IsZOrderSwizzle(swizzleMode)) { @@ -4228,6 +4215,10 @@ const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( patInfo = m_settings.supportRbPlus ? 
GFX10_SW_4K_D_RBPLUS_PATINFO : GFX10_SW_4K_D_PATINFO; } + else if (swizzleMode == ADDR_SW_4KB_R_X) + { + patInfo = NULL; + } else { ADDR_ASSERT(swizzleMode == ADDR_SW_4KB_D_X); @@ -4351,6 +4342,7 @@ ADDR_E_RETURNCODE Gfx10Lib::ComputeSurfaceAddrFromCoordMicroTiled( ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {}; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; ADDR2_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); localIn.swizzleMode = pIn->swizzleMode; localIn.flags = pIn->flags; @@ -4417,6 +4409,7 @@ ADDR_E_RETURNCODE Gfx10Lib::ComputeSurfaceAddrFromCoordMacroTiled( ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {}; ADDR2_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; ADDR2_MIP_INFO mipInfo[MaxMipLevels]; + ADDR_ASSERT(pIn->numMipLevels <= MaxMipLevels); localIn.swizzleMode = pIn->swizzleMode; localIn.flags = pIn->flags; @@ -4809,4 +4802,4 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceInfoLinear( } // V2 } // Addr -} // rocr +} // namespace rocr diff --git a/src/image/addrlib/src/gfx10/gfx10addrlib.h b/src/image/addrlib/src/gfx10/gfx10addrlib.h index 542d51a43..9dbaefe0e 100644 --- a/src/image/addrlib/src/gfx10/gfx10addrlib.h +++ b/src/image/addrlib/src/gfx10/gfx10addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,8 +21,10 @@ #include "gfx10SwizzlePattern.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -57,8 +42,8 @@ struct Gfx10ChipSettings UINT_32 supportRbPlus : 1; UINT_32 dsMipmapHtileFix : 1; UINT_32 dccUnsup3DSwDis : 1; - UINT_32 : 2; - UINT_32 reserved2 : 26; + UINT_32 : 4; + UINT_32 reserved2 : 24; }; }; @@ -158,7 +143,11 @@ const UINT_32 Gfx10Rsrc3dPrtSwModeMask = Gfx10Rsrc2dPrtSwModeMask & ~Gfx10Displa const UINT_32 Gfx10Rsrc3dThin64KBSwModeMask = (1u << ADDR_SW_64KB_Z_X) | (1u << ADDR_SW_64KB_R_X); -const UINT_32 Gfx10Rsrc3dThinSwModeMask = Gfx10Rsrc3dThin64KBSwModeMask | 
Gfx10BlkVarSwModeMask; + +const UINT_32 Gfx10Rsrc3dThinSwModeMask = Gfx10Rsrc3dThin64KBSwModeMask | + Gfx10BlkVarSwModeMask; + +const UINT_32 Gfx10Rsrc3dViewAs2dSwModeMask = Gfx10Rsrc3dThinSwModeMask | Gfx10LinearSwModeMask; const UINT_32 Gfx10Rsrc3dThickSwModeMask = Gfx10Rsrc3dSwModeMask & ~(Gfx10Rsrc3dThinSwModeMask | Gfx10LinearSwModeMask); @@ -166,8 +155,9 @@ const UINT_32 Gfx10Rsrc3dThick4KBSwModeMask = Gfx10Rsrc3dThickSwModeMask & Gfx10 const UINT_32 Gfx10Rsrc3dThick64KBSwModeMask = Gfx10Rsrc3dThickSwModeMask & Gfx10Blk64KBSwModeMask; -const UINT_32 Gfx10MsaaSwModeMask = Gfx10ZSwModeMask | - Gfx10RenderSwModeMask; +const UINT_32 Gfx10MsaaSwModeMask = (Gfx10ZSwModeMask | + Gfx10RenderSwModeMask) + ; const UINT_32 Dcn20NonBpp64SwModeMask = (1u << ADDR_SW_LINEAR) | (1u << ADDR_SW_4KB_S) | @@ -396,6 +386,12 @@ class Gfx10Lib : public Lib UINT_32 log2Elem, UINT_32 numFrag) const; + /** + * Will use the indices, "nibbles", to build an index equation inside pSwizzle + * + * @param pPatInfo Pointer to a patInfo. Contains indices mapping to the 2D nibble arrays which will be used to build an index equation. + * @param pSwizzle Array to write the index equation to. + */ VOID GetSwizzlePatternFromPatternInfo( const ADDR_SW_PATINFO* pPatInfo, ADDR_BIT_SETTING (&pSwizzle)[20]) const @@ -570,12 +566,13 @@ class Gfx10Lib : public Lib UINT_32 m_colorBaseIndex; UINT_32 m_xmaskBaseIndex; + UINT_32 m_htileBaseIndex; UINT_32 m_dccBaseIndex; }; } // V2 } // Addr -} // rocr +} // namespace rocr #endif diff --git a/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h b/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h index c9f92bbea..1cdf84605 100644 --- a/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h +++ b/src/image/addrlib/src/gfx11/gfx11SwizzlePattern.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -34,9 +17,10 @@ #define __GFX11_SWIZZLE_PATTERN_H__ namespace rocr { -namespace Addr { -namespace V2 { - +namespace Addr +{ +namespace V2 +{ const ADDR_SW_PATINFO GFX11_SW_256_D_PATINFO[] = { { 1, 0, 0, 0, 0, } , // 1 pipes (1 PKRs) 1 bpe @ SW_256_D @@ -3050,7 +3034,7 @@ const UINT_64 GFX11_HTILE_SW_PATTERN[][18] = {0, 0, 0, X3, Y3, X7, Y7, X8, Y8, Y4^X9^Y9, Z1^X4^Y4, Z0^Y5^X8, X5^Y8, Y6^X7, X6^Y7, X9, Y9, X10, }, //17 }; -} // V2 +}// V2 } // Addr -} // rocr +} // namespace rocr #endif diff --git a/src/image/addrlib/src/gfx11/gfx11addrlib.cpp b/src/image/addrlib/src/gfx11/gfx11addrlib.cpp index c56be1a57..bcaa539d4 100644 --- a/src/image/addrlib/src/gfx11/gfx11addrlib.cpp +++ b/src/image/addrlib/src/gfx11/gfx11addrlib.cpp @@ -2,24 +2,7 @@ 
************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,9 +20,9 @@ //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - namespace rocr { -namespace Addr { +namespace Addr +{ /** ************************************************************************************************************************ * Gfx11HwlInit @@ -289,18 +272,23 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeDccInfo( } else { - const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); - - const BOOL_32 isThick = IsThick(pIn->resourceType, pIn->swizzleMode); - - pOut->compressBlkWidth = isThick ? Block256_3d[elemLog2].w : Block256_2d[elemLog2].w; - pOut->compressBlkHeight = isThick ? Block256_3d[elemLog2].h : Block256_2d[elemLog2].h; - pOut->compressBlkDepth = isThick ? 
Block256_3d[elemLog2].d : 1; + const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); + const UINT_32 numFragLog2 = Log2(Max(pIn->numFrags, 1u)); + Dim3d compBlock = {}; + + GetCompressedBlockSizeLog2(Gfx11DataColor, + pIn->resourceType, + pIn->swizzleMode, + elemLog2, + numFragLog2, + &compBlock); + pOut->compressBlkWidth = 1 << compBlock.w; + pOut->compressBlkHeight = 1 << compBlock.h; + pOut->compressBlkDepth = 1 << compBlock.d; if (ret == ADDR_OK) { Dim3d metaBlk = {}; - const UINT_32 numFragLog2 = Log2(Max(pIn->numFrags, 1u)); const UINT_32 metaBlkSize = GetMetaBlkSize(Gfx11DataColor, pIn->resourceType, pIn->swizzleMode, @@ -386,6 +374,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeDccInfo( // Get the DCC address equation (copied from DccAddrFromCoord) const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); + const UINT_32 numPipeLog2 = m_pipesLog2; UINT_32 index = m_dccBaseIndex + elemLog2; const UINT_8* patIdxTable = (pIn->swizzleMode == ADDR_SW_64KB_R_X) ? GFX11_DCC_64K_R_X_PATIDX : GFX11_DCC_256K_R_X_PATIDX; @@ -739,27 +728,26 @@ ChipFamily Gfx11Lib::HwlConvertChipFamily( switch (chipFamily) { - case FAMILY_GFX1100: - if (ASICREV_IS_GFX1100(chipRevision)) - { - } - if (ASICREV_IS_GFX1101(chipRevision)) + case FAMILY_NV3: + if (ASICREV_IS_NAVI31_P(chipRevision)) { } - if (ASICREV_IS_GFX1102(chipRevision)) + if (ASICREV_IS_NAVI32_P(chipRevision)) { } - break; - case FAMILY_GFX1103: - if (ASICREV_IS_GFX1103(chipRevision)) + if (ASICREV_IS_NAVI33_P(chipRevision)) { } break; case FAMILY_GFX1150: if (ASICREV_IS_GFX1150(chipRevision)) { + m_settings.isGfx1150 = 1; } break; + case FAMILY_GFX1103: + m_settings.isGfx1103 = 1; + break; default: ADDR_ASSERT(!"Unknown chip family"); break; @@ -1103,13 +1091,14 @@ VOID Gfx11Lib::ConvertSwizzlePatternToEquation( ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern const { - ADDR_BIT_SETTING fullSwizzlePattern[20]; + ADDR_BIT_SETTING fullSwizzlePattern[ADDR_MAX_EQUATION_BIT]; GetSwizzlePatternFromPatternInfo(pPatInfo, 
fullSwizzlePattern); const ADDR_BIT_SETTING* pSwizzle = fullSwizzlePattern; const UINT_32 blockSizeLog2 = GetBlockSizeLog2(swMode); - + memset(pEquation, 0, sizeof(ADDR_EQUATION)); pEquation->numBits = blockSizeLog2; + pEquation->numBitComponents = pPatInfo->maxItemCount; pEquation->stackedDepthSlices = FALSE; for (UINT_32 i = 0; i < elemLog2; i++) @@ -1658,20 +1647,21 @@ VOID Gfx11Lib::InitEquationTable() { ADDR_ASSERT(IsValidSwMode(swMode)); - if (pPatInfo->maxItemCount <= 3) + if (pPatInfo->maxItemCount <= 3) // Get a valid equationIndex { ADDR_EQUATION equation = {}; + // Passing in pPatInfo to get the addr equation ConvertSwizzlePatternToEquation(elemLog2, rsrcType, swMode, pPatInfo, &equation); equationIndex = m_numEquations; ADDR_ASSERT(equationIndex < EquationTableSize); - + // Updates m_equationTable[m_numEquations] to be the addr equation for this PatInfo m_equationTable[equationIndex] = equation; - + // Increment m_numEquations m_numEquations++; } - else + else // There is no equationIndex { // We only see "ill" equation from 64/128 BPE + 3D resource + SW_64KB_D_X ADDR_ASSERT((elemLog2 == 3) || (elemLog2 == 4)); @@ -1744,7 +1734,19 @@ UINT_32 Gfx11Lib::GetValidDisplaySwizzleModes( if (bpp <= 64) { + const ChipFamily family = GetChipFamily(); + swModeMask = Dcn32SwModeMask; + + if (false + || (m_settings.isGfx1103) + || (m_settings.isGfx1150) + ) + { + // Not all GPUs support displaying with 256kB swizzle modes. + swModeMask &= ~((1u << ADDR_SW_256KB_D_X) | + (1u << ADDR_SW_256KB_R_X)); + } } return swModeMask; @@ -1936,15 +1938,15 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( { ADDR_E_RETURNCODE returnCode = ADDR_OK; - if (pIn->resourceType != ADDR_RSRC_TEX_2D) + if (IsThin(pIn->resourceType, pIn->swizzleMode) == FALSE) { - // Only 2D resource can have a NonBC view... + // Only thin swizzle mode can have a NonBC view... 
returnCode = ADDR_INVALIDPARAMS; } - else if ((pIn->format != ADDR_FMT_ASTC_8x8) && + else if (((pIn->format < ADDR_FMT_ASTC_4x4) || (pIn->format > ADDR_FMT_ETC2_128BPP)) && ((pIn->format < ADDR_FMT_BC1) || (pIn->format > ADDR_FMT_BC7))) { - // Only support BC1~BC7 or ASTC_8x8 for now... + // Only support BC1~BC7, ASTC, or ETC2 for now... returnCode = ADDR_NOTSUPPORTED; } else @@ -1957,8 +1959,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( infoIn.swizzleMode = pIn->swizzleMode; infoIn.resourceType = pIn->resourceType; infoIn.bpp = bpp; - infoIn.width = PowTwoAlign(pIn->width, bcWidth) / bcWidth; - infoIn.height = PowTwoAlign(pIn->height, bcHeight) / bcHeight; + infoIn.width = RoundUpQuotient(pIn->width, bcWidth); + infoIn.height = RoundUpQuotient(pIn->height, bcHeight); infoIn.numSlices = pIn->numSlices; infoIn.numMipLevels = pIn->numMipLevels; infoIn.numSamples = 1; @@ -2010,8 +2012,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( pOut->pipeBankXor = slicePbXorOut.pipeBankXor; const BOOL_32 inTail = tiled && (pIn->mipId >= infoOut.firstMipIdInTail) ? 
TRUE : FALSE; - const UINT_32 requestMipWidth = PowTwoAlign(Max(pIn->width >> pIn->mipId, 1u), bcWidth) / bcWidth; - const UINT_32 requestMipHeight = PowTwoAlign(Max(pIn->height >> pIn->mipId, 1u), bcHeight) / bcHeight; + const UINT_32 requestMipWidth = RoundUpQuotient(Max(pIn->width >> pIn->mipId, 1u), bcWidth); + const UINT_32 requestMipHeight = RoundUpQuotient(Max(pIn->height >> pIn->mipId, 1u), bcHeight); if (inTail) { @@ -2061,10 +2063,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeNonBlockCompressedView( pOut->mipId = 1; pOut->numMipLevels = 2; - const UINT_32 upperMipWidth = - PowTwoAlign(Max(pIn->width >> (pIn->mipId - 1), 1u), bcWidth) / bcWidth; - const UINT_32 upperMipHeight = - PowTwoAlign(Max(pIn->height >> (pIn->mipId - 1), 1u), bcHeight) / bcHeight; + const UINT_32 upperMipWidth = RoundUpQuotient(Max(pIn->width >> (pIn->mipId - 1), 1u), bcWidth); + const UINT_32 upperMipHeight = RoundUpQuotient(Max(pIn->height >> (pIn->mipId - 1), 1u), bcHeight); const BOOL_32 needToAvoidInTail = tiled && (requestMipWidth <= infoOut.blockWidth / 2) && (requestMipHeight <= infoOut.blockHeight) ? @@ -2226,6 +2226,7 @@ BOOL_32 Gfx11Lib::ValidateSwModeParams( const BOOL_32 thin3d = flags.view3dAs2dArray; const BOOL_32 linear = IsLinear(swizzle); const BOOL_32 blk256B = IsBlock256b(swizzle); + const BOOL_32 isNonPrtXor = IsNonPrtXor(swizzle); const BOOL_32 prt = flags.prt; // Misc check @@ -2571,8 +2572,9 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( pOut->resourceType = pIn->resourceType; pOut->validSwModeSet = allowedSwModeSet; pOut->canXor = (allowedSwModeSet.value & Gfx11XorSwModeMask) ? 
TRUE : FALSE; - pOut->validBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); - pOut->validSwTypeSet = GetAllowedSwSet(allowedSwModeSet); + + GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType, &(pOut->validBlockSet)); + GetAllowedSwSet(allowedSwModeSet, &(pOut->validSwTypeSet)); pOut->clientPreferredSwSet = pIn->preferredSwSet; @@ -2584,7 +2586,9 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( // Apply optional restrictions if (pIn->flags.needEquation) { - FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); } if (allowedSwModeSet.value == Gfx11LinearSwModeMask) @@ -2603,7 +2607,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( allowedSwModeSet.swLinear = 0; } - ADDR2_BLOCK_SET allowedBlockSet = GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType); + ADDR2_BLOCK_SET allowedBlockSet = {}; + GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType, &allowedBlockSet); // Determine block size if there are 2 or more block type candidates if (IsPow2(allowedBlockSet.value) == FALSE) @@ -2632,6 +2637,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; UINT_64 minSize = 0; @@ -2639,7 +2645,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockLinear; i < AddrBlockMaxTiledType; i++) { - if (IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + if (Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { localIn.swizzleMode = swMode[i]; @@ -2657,7 +2663,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( padSize[i] = localOut.surfSize; if ((minSize == 0) || - BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) { minSize = padSize[i]; minSizeBlk = i; @@ -2702,9 +2708,9 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) { if ((i != minSizeBlk) && - IsBlockTypeAvaiable(allowedBlockSet, static_cast(i))) + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast(i))) { - if (BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) { // Clear the block type if the memory waste is unacceptable allowedBlockSet.value &= ~(1u << (i - 1)); @@ -2776,9 +2782,11 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } // Block type should be determined. 
- ADDR_ASSERT(IsPow2(GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType).value)); + GetAllowedBlockSet(allowedSwModeSet, pOut->resourceType, &allowedBlockSet); + ADDR_ASSERT(IsPow2(allowedBlockSet.value)); - ADDR2_SWTYPE_SET allowedSwSet = GetAllowedSwSet(allowedSwModeSet); + ADDR2_SWTYPE_SET allowedSwSet = {}; + GetAllowedSwSet(allowedSwModeSet, &allowedSwSet); // Determine swizzle type if there are 2 or more swizzle type candidates if ((allowedSwSet.value != 0) && (IsPow2(allowedSwSet.value) == FALSE)) @@ -2819,7 +2827,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( { if (pIn->flags.color && allowedSwSet.sw_R) { - allowedSwModeSet.value &= Gfx11DisplaySwModeMask; + allowedSwModeSet.value &= Gfx11RenderSwModeMask; } else if (allowedSwSet.sw_S) { @@ -2827,7 +2835,7 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } else if (allowedSwSet.sw_D) { - allowedSwModeSet.value &= Gfx11RenderSwModeMask; + allowedSwModeSet.value &= Gfx11DisplaySwModeMask; } else { @@ -2856,7 +2864,8 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( } // Swizzle type should be determined. - ADDR_ASSERT(IsPow2(GetAllowedSwSet(allowedSwModeSet).value)); + GetAllowedSwSet(allowedSwModeSet, &allowedSwSet); + ADDR_ASSERT(IsPow2(allowedSwSet.value)); } // Determine swizzle mode now. 
Always select the "largest" swizzle mode for a given block type + @@ -2883,6 +2892,271 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlGetPreferredSurfaceSetting( return returnCode; } +/** +************************************************************************************************************************ +* Gfx11Lib::HwlGetPossibleSwizzleModes +* +* @brief +* Returns a list of swizzle modes that are valid from the hardware's perspective for the client to choose from +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx11Lib::HwlGetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ///< [in] input structure + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (pIn->flags.fmask) + { + // There is no FMASK for GFX11 ASICs. + ADDR_ASSERT_ALWAYS(); + + returnCode = ADDR_INVALIDPARAMS; + } + else + { + UINT_32 bpp = pIn->bpp; + UINT_32 width = Max(pIn->width, 1u); + UINT_32 height = Max(pIn->height, 1u); + + // Set format to INVALID will skip this conversion + if (pIn->format != ADDR_FMT_INVALID) + { + ElemMode elemMode = ADDR_UNCOMPRESSED; + UINT_32 expandX, expandY; + + // Get compression/expansion factors and element mode which indicates compression/expansion + bpp = GetElemLib()->GetBitsPerPixel(pIn->format, + &elemMode, + &expandX, + &expandY); + + UINT_32 basePitch = 0; + GetElemLib()->AdjustSurfaceInfo(elemMode, + expandX, + expandY, + &bpp, + &basePitch, + &width, + &height); + } + + const UINT_32 numSlices = Max(pIn->numSlices, 1u); + const UINT_32 numMipLevels = Max(pIn->numMipLevels, 1u); + const UINT_32 numSamples = Max(pIn->numSamples, 1u); + const BOOL_32 msaa = numSamples > 1; + + // Pre sanity check on non swizzle mode parameters + ADDR2_COMPUTE_SURFACE_INFO_INPUT localIn = {}; + localIn.flags = pIn->flags; + localIn.resourceType 
= pIn->resourceType; + localIn.format = pIn->format; + localIn.bpp = bpp; + localIn.width = width; + localIn.height = height; + localIn.numSlices = numSlices; + localIn.numMipLevels = numMipLevels; + localIn.numSamples = numSamples; + localIn.numFrags = numSamples; + + if (ValidateNonSwModeParams(&localIn)) + { + // Allow appropriate swizzle modes by default + ADDR2_SWMODE_SET allowedSwModeSet = {}; + allowedSwModeSet.value |= Gfx11LinearSwModeMask | Gfx11Blk256BSwModeMask; + if (pIn->resourceType == ADDR_RSRC_TEX_3D) + { + allowedSwModeSet.value |= Gfx11Rsrc3dThick4KBSwModeMask | + Gfx11Rsrc3dThin64KBSwModeMask | + Gfx11Rsrc3dThick64KBSwModeMask | + Gfx11Rsrc3dThin256KBSwModeMask | + Gfx11Rsrc3dThick256KBSwModeMask; + } + else + { + allowedSwModeSet.value |= Gfx11Blk4KBSwModeMask | Gfx11Blk64KBSwModeMask | Gfx11Blk256KBSwModeMask; + } + + // Filter out invalid swizzle mode(s) by image attributes and HW restrictions + switch (pIn->resourceType) + { + case ADDR_RSRC_TEX_1D: + allowedSwModeSet.value &= Gfx11Rsrc1dSwModeMask; + break; + + case ADDR_RSRC_TEX_2D: + allowedSwModeSet.value &= pIn->flags.prt ? Gfx11Rsrc2dPrtSwModeMask : Gfx11Rsrc2dSwModeMask; + break; + + case ADDR_RSRC_TEX_3D: + allowedSwModeSet.value &= pIn->flags.prt ? Gfx11Rsrc3dPrtSwModeMask : Gfx11Rsrc3dSwModeMask; + + if (pIn->flags.view3dAs2dArray) + { + allowedSwModeSet.value &= Gfx11Rsrc3dThinSwModeMask; + } + break; + + default: + ADDR_ASSERT_ALWAYS(); + allowedSwModeSet.value = 0; + break; + } + + // TODO: figure out if following restrictions are correct on GFX11... 
+ if (ElemLib::IsBlockCompressed(pIn->format) || + ElemLib::IsMacroPixelPacked(pIn->format) || + (bpp > 64) || + (msaa && ((bpp > 32) || pIn->flags.color || pIn->flags.unordered))) + { + allowedSwModeSet.value &= ~Gfx11ZSwModeMask; + } + + if (pIn->format == ADDR_FMT_32_32_32) + { + allowedSwModeSet.value &= Gfx11LinearSwModeMask; + } + + if (msaa) + { + allowedSwModeSet.value &= Gfx11MsaaSwModeMask; + } + + if (pIn->flags.depth || pIn->flags.stencil) + { + allowedSwModeSet.value &= Gfx11ZSwModeMask; + } + + if (pIn->flags.display) + { + allowedSwModeSet.value &= GetValidDisplaySwizzleModes(bpp); + } + + if (allowedSwModeSet.value != 0) + { +#if DEBUG + // Post sanity check, at least AddrLib should accept the output generated by its own + UINT_32 validateSwModeSet = allowedSwModeSet.value; + + for (UINT_32 i = 0; validateSwModeSet != 0; i++) + { + if (validateSwModeSet & 1) + { + localIn.swizzleMode = static_cast(i); + ADDR_ASSERT(ValidateSwModeParams(&localIn)); + } + + validateSwModeSet >>= 1; + } +#endif + + pOut->resourceType = pIn->resourceType; + pOut->clientPreferredSwSet = pIn->preferredSwSet; + + if (pOut->clientPreferredSwSet.value == 0) + { + pOut->clientPreferredSwSet.value = AddrSwSetAll; + } + + if (pIn->flags.needEquation) + { + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); + } + + pOut->validSwModeSet = allowedSwModeSet; + pOut->canXor = (allowedSwModeSet.value & Gfx11XorSwModeMask) ? TRUE : FALSE; + } + else + { + // Invalid combination... + ADDR_ASSERT_ALWAYS(); + returnCode = ADDR_INVALIDPARAMS; + } + } + else + { + // Invalid combination... 
+ ADDR_ASSERT_ALWAYS(); + returnCode = ADDR_INVALIDPARAMS; + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx11Lib::HwlGetAllowedBlockSet +* +* @brief +* Returns the set of allowed block sizes given the allowed swizzle modes and resource type +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx11Lib::HwlGetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + AddrResourceType rsrcType, ///< [in] resource type + ADDR2_BLOCK_SET* pAllowedBlockSet ///< [out] allowed block sizes + ) const +{ + ADDR2_BLOCK_SET allowedBlockSet = {}; + + allowedBlockSet.micro = (allowedSwModeSet.value & Gfx11Blk256BSwModeMask) ? TRUE : FALSE; + allowedBlockSet.linear = (allowedSwModeSet.value & Gfx11LinearSwModeMask) ? TRUE : FALSE; + + if (rsrcType == ADDR_RSRC_TEX_3D) + { + allowedBlockSet.macroThick4KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThick64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin256KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.gfx11.thick256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick256KBSwModeMask) ? TRUE : FALSE; + } + else + { + allowedBlockSet.macroThin4KB = (allowedSwModeSet.value & Gfx11Blk4KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Blk64KBSwModeMask) ? TRUE : FALSE; + allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Blk256KBSwModeMask) ? 
TRUE : FALSE; + } + + *pAllowedBlockSet = allowedBlockSet; + return ADDR_OK; +} + +/** +************************************************************************************************************************ +* Gfx11Lib::HwlGetAllowedSwSet +* +* @brief +* Returns the set of allowed swizzle types given the allowed swizzle modes +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx11Lib::HwlGetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, ///< [in] allowed swizzle modes + ADDR2_SWTYPE_SET* pAllowedSwSet ///< [out] allowed swizzle types + ) const +{ + ADDR2_SWTYPE_SET allowedSwSet = {}; + + allowedSwSet.sw_Z = (allowedSwModeSet.value & Gfx11ZSwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_S = (allowedSwModeSet.value & Gfx11StandardSwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_D = (allowedSwModeSet.value & Gfx11DisplaySwModeMask) ? TRUE : FALSE; + allowedSwSet.sw_R = (allowedSwModeSet.value & Gfx11RenderSwModeMask) ? 
TRUE : FALSE; + + *pAllowedSwSet = allowedSwSet; + return ADDR_OK; +} + /** ************************************************************************************************************************ * Gfx11Lib::ComputeStereoInfo @@ -3182,12 +3456,12 @@ ADDR_E_RETURNCODE Gfx11Lib::ComputeSurfaceInfoMacroTiled( UINT_64 mipSize[MaxMipLevels]; UINT_64 mipSliceSize[MaxMipLevels]; + // For htile, we need to make z16 and stencil enter the mip tail at the same time as z32 would Dim3d fixedTailMaxDim = tailMaxDim; - - if ((IsZOrderSwizzle(pIn->swizzleMode) || IsRtOptSwizzle(pIn->swizzleMode)) && (index <= 1)) + if (IsZOrderSwizzle(pIn->swizzleMode) && (index <= 1)) { fixedTailMaxDim.w /= Block256_2d[index].w / Block256_2d[2].w; - fixedTailMaxDim.h /= Block256_2d[index].h / Block256_2d[2].h; + fixedTailMaxDim.h /= Block256_2d[index].w / Block256_2d[2].w; } for (UINT_32 i = 0; i < pIn->numMipLevels; i++) @@ -3400,54 +3674,23 @@ UINT_32 Gfx11Lib::ComputeOffsetFromEquation( { UINT_32 v = 0; - if (pEq->addr[i].valid) - { - if (pEq->addr[i].channel == 0) - { - v ^= (x >> pEq->addr[i].index) & 1; - } - else if (pEq->addr[i].channel == 1) - { - v ^= (y >> pEq->addr[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->addr[i].channel == 2); - v ^= (z >> pEq->addr[i].index) & 1; - } - } - - if (pEq->xor1[i].valid) - { - if (pEq->xor1[i].channel == 0) - { - v ^= (x >> pEq->xor1[i].index) & 1; - } - else if (pEq->xor1[i].channel == 1) - { - v ^= (y >> pEq->xor1[i].index) & 1; - } - else - { - ADDR_ASSERT(pEq->xor1[i].channel == 2); - v ^= (z >> pEq->xor1[i].index) & 1; - } - } - - if (pEq->xor2[i].valid) + for (UINT_32 c = 0; c < pEq->numBitComponents; c++) { - if (pEq->xor2[i].channel == 0) - { - v ^= (x >> pEq->xor2[i].index) & 1; - } - else if (pEq->xor2[i].channel == 1) - { - v ^= (y >> pEq->xor2[i].index) & 1; - } - else + if (pEq->comps[c][i].valid) { - ADDR_ASSERT(pEq->xor2[i].channel == 2); - v ^= (z >> pEq->xor2[i].index) & 1; + if (pEq->comps[c][i].channel == 0) + { + v ^= (x >> 
pEq->comps[c][i].index) & 1; + } + else if (pEq->comps[c][i].channel == 1) + { + v ^= (y >> pEq->comps[c][i].index) & 1; + } + else + { + ADDR_ASSERT(pEq->comps[c][i].channel == 2); + v ^= (z >> pEq->comps[c][i].index) & 1; + } } } @@ -4033,6 +4276,7 @@ UINT_32 Gfx11Lib::HwlComputeMaxMetaBaseAlignments() const } // Max base alignment for 2D Dcc + // swizzle mode support DCC... const AddrSwizzleMode ValidSwizzleModeForDcc2D[] = { ADDR_SW_64KB_R_X, @@ -4250,4 +4494,4 @@ ADDR_E_RETURNCODE Gfx11Lib::HwlComputeSurfaceInfoLinear( } // V2 } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/gfx11/gfx11addrlib.h b/src/image/addrlib/src/gfx11/gfx11addrlib.h index 391eda2cb..78ffc46b4 100644 --- a/src/image/addrlib/src/gfx11/gfx11addrlib.h +++ b/src/image/addrlib/src/gfx11/gfx11addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,8 +21,10 @@ #include "gfx11SwizzlePattern.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -50,7 +35,9 @@ struct Gfx11ChipSettings { struct { - UINT_32 reserved1 : 32; + UINT_32 isGfx1150 : 1; + UINT_32 isGfx1103 : 1; + UINT_32 reserved1 : 30; // Misc configuration bits UINT_32 reserved2 : 32; @@ -285,6 +272,19 @@ class Gfx11Lib : public Lib const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + virtual ADDR_E_RETURNCODE HwlGetPossibleSwizzleModes( + const ADDR2_GET_PREFERRED_SURF_SETTING_INPUT* pIn, + ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlGetAllowedBlockSet( + ADDR2_SWMODE_SET allowedSwModeSet, + AddrResourceType rsrcType, + ADDR2_BLOCK_SET* pAllowedBlockSet) const; + + virtual ADDR_E_RETURNCODE HwlGetAllowedSwSet( + ADDR2_SWMODE_SET allowedSwModeSet, + ADDR2_SWTYPE_SET* pAllowedSwSet) const; + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfoSanityCheck( const ADDR2_COMPUTE_SURFACE_INFO_INPUT* pIn) const; @@ -467,43 +467,6 @@ class Gfx11Lib : public Lib UINT_32 GetMaxNumMipsInTail(UINT_32 blockSizeLog2, BOOL_32 isThin) const; - static ADDR2_BLOCK_SET GetAllowedBlockSet(ADDR2_SWMODE_SET allowedSwModeSet, AddrResourceType rsrcType) - { - ADDR2_BLOCK_SET allowedBlockSet = {}; - - allowedBlockSet.micro = (allowedSwModeSet.value & Gfx11Blk256BSwModeMask) ? 
TRUE : FALSE; - allowedBlockSet.linear = (allowedSwModeSet.value & Gfx11LinearSwModeMask) ? TRUE : FALSE; - - if (rsrcType == ADDR_RSRC_TEX_3D) - { - allowedBlockSet.macroThick4KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick4KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macroThick64KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThin256KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.gfx11.thick256KB = (allowedSwModeSet.value & Gfx11Rsrc3dThick256KBSwModeMask) ? TRUE : FALSE; - } - else - { - allowedBlockSet.macroThin4KB = (allowedSwModeSet.value & Gfx11Blk4KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.macroThin64KB = (allowedSwModeSet.value & Gfx11Blk64KBSwModeMask) ? TRUE : FALSE; - allowedBlockSet.gfx11.thin256KB = (allowedSwModeSet.value & Gfx11Blk256KBSwModeMask) ? TRUE : FALSE; - } - - return allowedBlockSet; - } - - static ADDR2_SWTYPE_SET GetAllowedSwSet(ADDR2_SWMODE_SET allowedSwModeSet) - { - ADDR2_SWTYPE_SET allowedSwSet = {}; - - allowedSwSet.sw_Z = (allowedSwModeSet.value & Gfx11ZSwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_S = (allowedSwModeSet.value & Gfx11StandardSwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_D = (allowedSwModeSet.value & Gfx11DisplaySwModeMask) ? TRUE : FALSE; - allowedSwSet.sw_R = (allowedSwModeSet.value & Gfx11RenderSwModeMask) ? 
TRUE : FALSE; - - return allowedSwSet; - } - BOOL_32 IsInMipTail( Dim3d mipTailDim, UINT_32 maxNumMipsInTail, @@ -555,5 +518,6 @@ class Gfx11Lib : public Lib } // V2 } // Addr -} // rocr +} // namespace rocr #endif + diff --git a/src/image/addrlib/src/gfx12/gfx12SwizzlePattern.h b/src/image/addrlib/src/gfx12/gfx12SwizzlePattern.h new file mode 100644 index 000000000..55508066d --- /dev/null +++ b/src/image/addrlib/src/gfx12/gfx12SwizzlePattern.h @@ -0,0 +1,280 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +/** +************************************************************************************************************************ +* @file gfx12SwizzlePattern.h +* @brief swizzle pattern for gfx12. 
+************************************************************************************************************************ +*/ + +#ifndef __GFX12_SWIZZLE_PATTERN_H__ +#define __GFX12_SWIZZLE_PATTERN_H__ + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + const ADDR_SW_PATINFO GFX12_SW_256B_2D_1xAA_PATINFO[] = + { + { 0, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_1xAA + { 1, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_1xAA + { 2, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_1xAA + { 3, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_1xAA + { 4, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256B_2D_2xAA_PATINFO[] = + { + { 5, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_2xAA + { 6, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_2xAA + { 7, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_2xAA + { 8, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_2xAA + { 9, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256B_2D_4xAA_PATINFO[] = + { + { 10, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_4xAA + { 11, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_4xAA + { 12, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_4xAA + { 13, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_4xAA + { 14, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256B_2D_8xAA_PATINFO[] = + { + { 15, 0, 0, 0, } , // 1 BPE @ SW_256B_2D_8xAA + { 16, 0, 0, 0, } , // 2 BPE @ SW_256B_2D_8xAA + { 17, 0, 0, 0, } , // 4 BPE @ SW_256B_2D_8xAA + { 18, 0, 0, 0, } , // 8 BPE @ SW_256B_2D_8xAA + { 19, 0, 0, 0, } , // 16 BPE @ SW_256B_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_1xAA_PATINFO[] = + { + { 0, 1, 0, 0, } , // 1 BPE @ SW_4KB_2D_1xAA + { 1, 2, 0, 0, } , // 2 BPE @ SW_4KB_2D_1xAA + { 2, 3, 0, 0, } , // 4 BPE @ SW_4KB_2D_1xAA + { 3, 4, 0, 0, } , // 8 BPE @ SW_4KB_2D_1xAA + { 4, 5, 0, 0, } , // 16 BPE @ SW_4KB_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_2xAA_PATINFO[] = + { + { 5, 2, 0, 0, } , // 1 BPE @ SW_4KB_2D_2xAA + { 6, 3, 0, 0, } , // 2 BPE @ SW_4KB_2D_2xAA + { 7, 4, 0, 0, } , // 4 BPE @ SW_4KB_2D_2xAA + { 8, 5, 0, 
0, } , // 8 BPE @ SW_4KB_2D_2xAA + { 9, 6, 0, 0, } , // 16 BPE @ SW_4KB_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_4xAA_PATINFO[] = + { + { 10, 3, 0, 0, } , // 1 BPE @ SW_4KB_2D_4xAA + { 11, 4, 0, 0, } , // 2 BPE @ SW_4KB_2D_4xAA + { 12, 5, 0, 0, } , // 4 BPE @ SW_4KB_2D_4xAA + { 13, 6, 0, 0, } , // 8 BPE @ SW_4KB_2D_4xAA + { 14, 7, 0, 0, } , // 16 BPE @ SW_4KB_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_2D_8xAA_PATINFO[] = + { + { 15, 4, 0, 0, } , // 1 BPE @ SW_4KB_2D_8xAA + { 16, 5, 0, 0, } , // 2 BPE @ SW_4KB_2D_8xAA + { 17, 6, 0, 0, } , // 4 BPE @ SW_4KB_2D_8xAA + { 18, 7, 0, 0, } , // 8 BPE @ SW_4KB_2D_8xAA + { 19, 8, 0, 0, } , // 16 BPE @ SW_4KB_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_1xAA_PATINFO[] = + { + { 0, 1, 1, 0, } , // 1 BPE @ SW_64KB_2D_1xAA + { 1, 2, 2, 0, } , // 2 BPE @ SW_64KB_2D_1xAA + { 2, 3, 3, 0, } , // 4 BPE @ SW_64KB_2D_1xAA + { 3, 4, 4, 0, } , // 8 BPE @ SW_64KB_2D_1xAA + { 4, 5, 5, 0, } , // 16 BPE @ SW_64KB_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_2xAA_PATINFO[] = + { + { 5, 2, 2, 0, } , // 1 BPE @ SW_64KB_2D_2xAA + { 6, 3, 3, 0, } , // 2 BPE @ SW_64KB_2D_2xAA + { 7, 4, 4, 0, } , // 4 BPE @ SW_64KB_2D_2xAA + { 8, 5, 5, 0, } , // 8 BPE @ SW_64KB_2D_2xAA + { 9, 6, 6, 0, } , // 16 BPE @ SW_64KB_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_4xAA_PATINFO[] = + { + { 10, 3, 3, 0, } , // 1 BPE @ SW_64KB_2D_4xAA + { 11, 4, 4, 0, } , // 2 BPE @ SW_64KB_2D_4xAA + { 12, 5, 5, 0, } , // 4 BPE @ SW_64KB_2D_4xAA + { 13, 6, 6, 0, } , // 8 BPE @ SW_64KB_2D_4xAA + { 14, 7, 7, 0, } , // 16 BPE @ SW_64KB_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_2D_8xAA_PATINFO[] = + { + { 15, 4, 4, 0, } , // 1 BPE @ SW_64KB_2D_8xAA + { 16, 5, 5, 0, } , // 2 BPE @ SW_64KB_2D_8xAA + { 17, 6, 6, 0, } , // 4 BPE @ SW_64KB_2D_8xAA + { 18, 7, 7, 0, } , // 8 BPE @ SW_64KB_2D_8xAA + { 19, 8, 8, 0, } , // 16 BPE @ SW_64KB_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_1xAA_PATINFO[] = + { + { 0, 1, 
1, 1, } , // 1 BPE @ SW_256KB_2D_1xAA + { 1, 2, 2, 2, } , // 2 BPE @ SW_256KB_2D_1xAA + { 2, 3, 3, 3, } , // 4 BPE @ SW_256KB_2D_1xAA + { 3, 4, 4, 4, } , // 8 BPE @ SW_256KB_2D_1xAA + { 4, 5, 5, 5, } , // 16 BPE @ SW_256KB_2D_1xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_2xAA_PATINFO[] = + { + { 5, 2, 2, 2, } , // 1 BPE @ SW_256KB_2D_2xAA + { 6, 3, 3, 3, } , // 2 BPE @ SW_256KB_2D_2xAA + { 7, 4, 4, 4, } , // 4 BPE @ SW_256KB_2D_2xAA + { 8, 5, 5, 5, } , // 8 BPE @ SW_256KB_2D_2xAA + { 9, 6, 6, 6, } , // 16 BPE @ SW_256KB_2D_2xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_4xAA_PATINFO[] = + { + { 10, 3, 3, 3, } , // 1 BPE @ SW_256KB_2D_4xAA + { 11, 4, 4, 4, } , // 2 BPE @ SW_256KB_2D_4xAA + { 12, 5, 5, 5, } , // 4 BPE @ SW_256KB_2D_4xAA + { 13, 6, 6, 6, } , // 8 BPE @ SW_256KB_2D_4xAA + { 14, 7, 7, 7, } , // 16 BPE @ SW_256KB_2D_4xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_2D_8xAA_PATINFO[] = + { + { 15, 4, 4, 4, } , // 1 BPE @ SW_256KB_2D_8xAA + { 16, 5, 5, 5, } , // 2 BPE @ SW_256KB_2D_8xAA + { 17, 6, 6, 6, } , // 4 BPE @ SW_256KB_2D_8xAA + { 18, 7, 7, 7, } , // 8 BPE @ SW_256KB_2D_8xAA + { 19, 8, 8, 8, } , // 16 BPE @ SW_256KB_2D_8xAA + }; + + const ADDR_SW_PATINFO GFX12_SW_4KB_3D_PATINFO[] = + { + { 20, 9, 0, 0, } , // 1 BPE @ SW_4KB_3D + { 21, 10, 0, 0, } , // 2 BPE @ SW_4KB_3D + { 22, 11, 0, 0, } , // 4 BPE @ SW_4KB_3D + { 23, 12, 0, 0, } , // 8 BPE @ SW_4KB_3D + { 24, 13, 0, 0, } , // 16 BPE @ SW_4KB_3D + }; + + const ADDR_SW_PATINFO GFX12_SW_64KB_3D_PATINFO[] = + { + { 20, 9, 9, 0, } , // 1 BPE @ SW_64KB_3D + { 21, 10, 10, 0, } , // 2 BPE @ SW_64KB_3D + { 22, 11, 11, 0, } , // 4 BPE @ SW_64KB_3D + { 23, 12, 12, 0, } , // 8 BPE @ SW_64KB_3D + { 24, 13, 13, 0, } , // 16 BPE @ SW_64KB_3D + }; + + const ADDR_SW_PATINFO GFX12_SW_256KB_3D_PATINFO[] = + { + { 20, 9, 9, 9, } , // 1 BPE @ SW_256KB_3D + { 21, 10, 10, 9, } , // 2 BPE @ SW_256KB_3D + { 22, 11, 11, 10, } , // 4 BPE @ SW_256KB_3D + { 23, 12, 12, 11, } , // 8 BPE @ SW_256KB_3D + { 24, 
13, 13, 11, } , // 16 BPE @ SW_256KB_3D + }; + + + const UINT_64 GFX12_SW_PATTERN_NIBBLE1[][8] = + { + {X0, X1, Y0, X2, Y1, Y2, X3, Y3, }, // 0 + {0, X0, Y0, X1, Y1, X2, Y2, X3, }, // 1 + {0, 0, X0, Y0, X1, Y1, X2, Y2, }, // 2 + {0, 0, 0, X0, Y0, X1, X2, Y1, }, // 3 + {0, 0, 0, 0, X0, Y0, X1, Y1, }, // 4 + {S0, X0, Y0, X1, Y1, X2, Y2, X3, }, // 5 + {0, S0, X0, Y0, X1, Y1, X2, Y2, }, // 6 + {0, 0, S0, X0, Y0, X1, Y1, X2, }, // 7 + {0, 0, 0, S0, X0, Y0, X1, Y1, }, // 8 + {0, 0, 0, 0, S0, X0, Y0, X1, }, // 9 + {S0, S1, X0, Y0, X1, Y1, X2, Y2, }, // 10 + {0, S0, S1, X0, Y0, X1, Y1, X2, }, // 11 + {0, 0, S0, S1, X0, Y0, X1, Y1, }, // 12 + {0, 0, 0, S0, S1, X0, Y0, X1, }, // 13 + {0, 0, 0, 0, S0, S1, X0, Y0, }, // 14 + {S0, S1, S2, X0, Y0, X1, Y1, X2, }, // 15 + {0, S0, S1, S2, X0, Y0, X1, Y1, }, // 16 + {0, 0, S0, S1, S2, X0, Y0, X1, }, // 17 + {0, 0, 0, S0, S1, S2, X0, Y0, }, // 18 + {0, 0, 0, 0, S0, S1, S2, X0, }, // 19 + {X0, X1, Z0, Y0, Y1, Z1, X2, Z2, }, // 20 + {0, X0, Z0, Y0, X1, Z1, Y1, Z2, }, // 21 + {0, 0, X0, Y0, X1, Z0, Y1, Z1, }, // 22 + {0, 0, 0, X0, Y0, Z0, X1, Z1, }, // 23 + {0, 0, 0, 0, X0, Z0, Y0, Z1, }, // 24 + }; + + const UINT_64 GFX12_SW_PATTERN_NIBBLE2[][4] = + { + {0, 0, 0, 0, }, // 0 + {Y4, X4, Y5, X5, }, // 1 + {Y3, X4, Y4, X5, }, // 2 + {Y3, X3, Y4, X4, }, // 3 + {Y2, X3, Y3, X4, }, // 4 + {Y2, X2, Y3, X3, }, // 5 + {Y1, X2, Y2, X3, }, // 6 + {Y1, X1, Y2, X2, }, // 7 + {Y0, X1, Y1, X2, }, // 8 + {Y2, X3, Z3, Y3, }, // 9 + {Y2, X2, Z3, Y3, }, // 10 + {Y2, X2, Z2, Y3, }, // 11 + {Y1, X2, Z2, Y2, }, // 12 + {Y1, X1, Z2, Y2, }, // 13 + }; + + const UINT_64 GFX12_SW_PATTERN_NIBBLE3[][4] = + { + {0, 0, 0, 0, }, // 0 + {Y6, X6, Y7, X7, }, // 1 + {Y5, X6, Y6, X7, }, // 2 + {Y5, X5, Y6, X6, }, // 3 + {Y4, X5, Y5, X6, }, // 4 + {Y4, X4, Y5, X5, }, // 5 + {Y3, X4, Y4, X5, }, // 6 + {Y3, X3, Y4, X4, }, // 7 + {Y2, X3, Y3, X4, }, // 8 + {X4, Z4, Y4, X5, }, // 9 + {X3, Z4, Y4, X4, }, // 10 + {X3, Z3, Y4, X4, }, // 11 + {X3, Z3, Y3, X4, }, // 12 + {X2, Z3, 
Y3, X3, }, // 13 + }; + + const UINT_64 GFX12_SW_PATTERN_NIBBLE4[][2] = + { + {0, 0, }, // 0 + {Y8, X8, }, // 1 + {Y7, X8, }, // 2 + {Y7, X7, }, // 3 + {Y6, X7, }, // 4 + {Y6, X6, }, // 5 + {Y5, X6, }, // 6 + {Y5, X5, }, // 7 + {Y4, X5, }, // 8 + {Z5, Y5, }, // 9 + {Z4, Y5, }, // 10 + {Z4, Y4, }, // 11 + }; + +} // V3 +} // Addr +} // namespace +#endif diff --git a/src/image/addrlib/src/gfx12/gfx12addrlib.cpp b/src/image/addrlib/src/gfx12/gfx12addrlib.cpp new file mode 100644 index 000000000..1b2e5e563 --- /dev/null +++ b/src/image/addrlib/src/gfx12/gfx12addrlib.cpp @@ -0,0 +1,1313 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +/** +************************************************************************************************************************ +* @file gfx12addrlib.cpp +* @brief Contain the implementation for the Gfx12Lib class. +************************************************************************************************************************ +*/ + +#include "gfx12addrlib.h" +#include "gfx12_gb_reg.h" + +#include "amdgpu_asic_addr.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +namespace rocr { +namespace Addr +{ +/** +************************************************************************************************************************ +* Gfx12HwlInit +* +* @brief +* Creates an Gfx12Lib object. +* +* @return +* Returns an Gfx12Lib object pointer. 
+************************************************************************************************************************ +*/ +Addr::Lib* Gfx12HwlInit( + const Client* pClient) +{ + return V3::Gfx12Lib::CreateObj(pClient); +} + +namespace V3 +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Static Const Member +//////////////////////////////////////////////////////////////////////////////////////////////////// +const SwizzleModeFlags Gfx12Lib::SwizzleModeTable[ADDR3_MAX_TYPE] = +{//Linear 2d 3d 256B 4KB 64KB 256KB Reserved + {{1, 0, 0, 0, 0, 0, 0, 0}}, // ADDR3_LINEAR + {{0, 1, 0, 1, 0, 0, 0, 0}}, // ADDR3_256B_2D + {{0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR3_4KB_2D + {{0, 1, 0, 0, 0, 1, 0, 0}}, // ADDR3_64KB_2D + {{0, 1, 0, 0, 0, 0, 1, 0}}, // ADDR3_256KB_2D + {{0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR3_4KB_3D + {{0, 0, 1, 0, 0, 1, 0, 0}}, // ADDR3_64KB_3D + {{0, 0, 1, 0, 0, 0, 1, 0}}, // ADDR3_256KB_3D +}; + +const ADDR_EXTENT3D Gfx12Lib::Block4K_Log2_3d[] = {{4, 4, 4}, {3, 4, 4}, {3, 4, 3}, {3, 3, 3}, {2, 3, 3}}; +const ADDR_EXTENT3D Gfx12Lib::Block64K_Log2_3d[] = {{6, 5, 5}, {5, 5, 5}, {5, 5, 4}, {5, 4, 4}, {4, 4, 4}}; +const ADDR_EXTENT3D Gfx12Lib::Block256K_Log2_3d[] = {{6, 6, 6}, {5, 6, 6}, {5, 6, 5}, {5, 5, 5}, {4, 5, 5}}; + +/** +************************************************************************************************************************ +* Gfx12Lib::Gfx12Lib +* +* @brief +* Constructor +* +************************************************************************************************************************ +*/ +Gfx12Lib::Gfx12Lib( + const Client* pClient) + : + Lib(pClient), + m_numSwizzleBits(0) +{ + memset(&m_settings, 0, sizeof(m_settings)); + memcpy(m_swizzleModeTable, SwizzleModeTable, sizeof(SwizzleModeTable)); +} + +/** +************************************************************************************************************************ +* Gfx12Lib::~Gfx12Lib +* +* @brief +* Destructor 
+************************************************************************************************************************ +*/ +Gfx12Lib::~Gfx12Lib() +{ +} + +/** +************************************************************************************************************************ +* Gfx12Lib::ConvertSwizzlePatternToEquation +* +* @brief +* Convert swizzle pattern to equation. +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Gfx12Lib::ConvertSwizzlePatternToEquation( + UINT_32 elemLog2, ///< [in] element bytes log2 + Addr3SwizzleMode swMode, ///< [in] swizzle mode + const ADDR_SW_PATINFO* pPatInfo, ///< [in] swizzle pattern info + ADDR_EQUATION* pEquation) ///< [out] equation converted from swizzle pattern + const +{ + ADDR_BIT_SETTING fullSwizzlePattern[Log2Size256K]; + GetSwizzlePatternFromPatternInfo(pPatInfo, fullSwizzlePattern); + + const ADDR_BIT_SETTING* pSwizzle = fullSwizzlePattern; + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(swMode, TRUE); + + pEquation->numBits = blockSizeLog2; + pEquation->stackedDepthSlices = FALSE; + + for (UINT_32 i = 0; i < elemLog2; i++) + { + pEquation->addr[i].channel = 0; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = i; + } + + for (UINT_32 i = elemLog2; i < blockSizeLog2; i++) + { + ADDR_ASSERT(IsPow2(pSwizzle[i].value)); + + if (pSwizzle[i].x != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].x))); + + pEquation->addr[i].channel = 0; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = Log2(pSwizzle[i].x) + elemLog2; + } + else if (pSwizzle[i].y != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].y))); + + pEquation->addr[i].channel = 1; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = Log2(pSwizzle[i].y); + } + else if (pSwizzle[i].z != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].z))); + + pEquation->addr[i].channel = 2; + pEquation->addr[i].valid = 1; + 
pEquation->addr[i].index = Log2(pSwizzle[i].z); + } + else if (pSwizzle[i].s != 0) + { + ADDR_ASSERT(IsPow2(static_cast(pSwizzle[i].s))); + + pEquation->addr[i].channel = 3; + pEquation->addr[i].valid = 1; + pEquation->addr[i].index = Log2(pSwizzle[i].s); + } + else + { + ADDR_ASSERT_ALWAYS(); + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::InitEquationTable +* +* @brief +* Initialize Equation table. +* +* @return +* N/A +************************************************************************************************************************ +*/ +VOID Gfx12Lib::InitEquationTable() +{ + memset(m_equationTable, 0, sizeof(m_equationTable)); + + for (UINT_32 swModeIdx = 0; swModeIdx < ADDR3_MAX_TYPE; swModeIdx++) + { + const Addr3SwizzleMode swMode = static_cast(swModeIdx); + + if (IsLinear(swMode)) + { + // Skip linear equation (data table is not useful for 2D/3D images-- only contains x-coordinate bits) + continue; + } + + const UINT_32 maxMsaa = Is2dSwizzle(swMode) ? 
MaxMsaaRateLog2 : 1; + + for (UINT_32 msaaIdx = 0; msaaIdx < maxMsaa; msaaIdx++) + { + for (UINT_32 elemLog2 = 0; elemLog2 < MaxElementBytesLog2; elemLog2++) + { + UINT_32 equationIndex = ADDR_INVALID_EQUATION_INDEX; + const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(swMode, elemLog2, 1 << msaaIdx); + + if (pPatInfo != NULL) + { + ADDR_ASSERT(IsValidSwMode(swMode)); + + ADDR_EQUATION equation = {}; + + ConvertSwizzlePatternToEquation(elemLog2, swMode, pPatInfo, &equation); + + equationIndex = m_numEquations; + ADDR_ASSERT(equationIndex < NumSwizzlePatterns); + + m_equationTable[equationIndex] = equation; + m_numEquations++; + } + SetEquationTableEntry(swMode, msaaIdx, elemLog2, equationIndex); + } + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetBlockPixelDimensions +* +* @brief +* Returns the pixel dimensions of one block. +* +************************************************************************************************************************ +*/ +ADDR_EXTENT3D Gfx12Lib::GetBlockPixelDimensions( + Addr3SwizzleMode swizzleMode, + UINT_32 log2BytesPerPixel + ) const +{ + ADDR_EXTENT3D log2Dim = {}; + + switch (swizzleMode) + { + case ADDR3_4KB_3D: + log2Dim = Block4K_Log2_3d[log2BytesPerPixel]; + break; + case ADDR3_64KB_3D: + log2Dim = Block64K_Log2_3d[log2BytesPerPixel]; + break; + case ADDR3_256KB_3D: + log2Dim = Block256K_Log2_3d[log2BytesPerPixel]; + break; + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + return { 1u << log2Dim.width, 1u << log2Dim.height, 1u << log2Dim.depth }; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetMipOrigin +* +* @brief +* Internal function to calculate origins of the mip levels +* +* @return +* ADDR_E_RETURNCODE 
+************************************************************************************************************************ +*/ +VOID Gfx12Lib::GetMipOrigin( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + const ADDR_EXTENT3D& mipExtentFirstInTail, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + const BOOL_32 is3d = Is3dSwizzle(pIn->swizzleMode); + const UINT_32 bytesPerPixel = pIn->bpp >> 3; + const UINT_32 log2Bpp = Log2(bytesPerPixel); + const ADDR_EXTENT3D pixelBlockDims = GetBlockPixelDimensions(ADDR3_4KB_3D, log2Bpp); + const ADDR_EXTENT3D tailMaxDim = GetMipTailDim(pIn->swizzleMode, + pOut->blockExtent); + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const UINT_32 maxMipsInTail = GetMaxNumMipsInTail(pIn->swizzleMode, blockSizeLog2); + + UINT_32 pitch = tailMaxDim.width; + UINT_32 height = tailMaxDim.height; + + UINT_32 depth = (is3d ? PowTwoAlign(mipExtentFirstInTail.depth, pixelBlockDims.depth) : 1); + + const UINT_32 tailMaxDepth = (is3d ? (depth / pixelBlockDims.depth) : 1); + + for (UINT_32 i = pOut->firstMipIdInTail; i < pIn->numMipLevels; i++) + { + INT_32 mipInTail = static_cast(i) - static_cast(pOut->firstMipIdInTail); + if ((mipInTail < 0) || (pIn->numMipLevels == 1)) + { + mipInTail = MaxMipLevels; + } + + // "m" can be negative + const INT_32 signedM = static_cast(maxMipsInTail) - static_cast(1) - mipInTail; + const UINT_32 m = Max(0, signedM); + const UINT_32 mipOffset = (m > 6) ? 
(16 << m) : (m << 8); + + pOut->pMipInfo[i].offset = mipOffset * tailMaxDepth; + pOut->pMipInfo[i].mipTailOffset = mipOffset; + pOut->pMipInfo[i].macroBlockOffset = 0; + + pOut->pMipInfo[i].pitch = pitch; + pOut->pMipInfo[i].height = height; + pOut->pMipInfo[i].depth = depth; + + if (IsLinear(pIn->swizzleMode)) + { + pOut->pMipInfo[i].mipTailCoordX = mipOffset >> 8; + pOut->pMipInfo[i].mipTailCoordY = 0; + pOut->pMipInfo[i].mipTailCoordZ = 0; + + pitch = Max(pitch >> 1, 1u); + } + else + { + UINT_32 mipX = ((mipOffset >> 9) & 1) | + ((mipOffset >> 10) & 2) | + ((mipOffset >> 11) & 4) | + ((mipOffset >> 12) & 8) | + ((mipOffset >> 13) & 16) | + ((mipOffset >> 14) & 32); + UINT_32 mipY = ((mipOffset >> 8) & 1) | + ((mipOffset >> 9) & 2) | + ((mipOffset >> 10) & 4) | + ((mipOffset >> 11) & 8) | + ((mipOffset >> 12) & 16) | + ((mipOffset >> 13) & 32); + + if (is3d == FALSE) + { + pOut->pMipInfo[i].mipTailCoordX = mipX * Block256_2d[log2Bpp].w; + pOut->pMipInfo[i].mipTailCoordY = mipY * Block256_2d[log2Bpp].h; + pOut->pMipInfo[i].mipTailCoordZ = 0; + + pitch = Max(pitch >> 1, Block256_2d[log2Bpp].w); + height = Max(height >> 1, Block256_2d[log2Bpp].h); + depth = 1; + } + else + { + pOut->pMipInfo[i].mipTailCoordX = mipX * pixelBlockDims.width; + pOut->pMipInfo[i].mipTailCoordY = mipY * pixelBlockDims.height; + pOut->pMipInfo[i].mipTailCoordZ = 0; + + pitch = Max(pitch >> 1, pixelBlockDims.width); + height = Max(height >> 1, pixelBlockDims.height); + depth = PowTwoAlign(Max(depth >> 1, 1u), pixelBlockDims.depth); + } + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetMipOffset +* +* @brief +* Internal function to calculate alignment for a surface +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +VOID Gfx12Lib::GetMipOffset( + const 
ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + const UINT_32 bytesPerPixel = pIn->bpp >> 3; + const UINT_32 log2Bpp = Log2(bytesPerPixel); + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const UINT_32 blockSize = 1 << blockSizeLog2; + const ADDR_EXTENT3D tailMaxDim = GetMipTailDim(pIn->swizzleMode, + pOut->blockExtent); + const ADDR_EXTENT3D mip0Dims = GetBaseMipExtents(pIn); + const UINT_32 maxMipsInTail = GetMaxNumMipsInTail(pIn->swizzleMode, blockSizeLog2); + + UINT_32 firstMipInTail = pIn->numMipLevels; + UINT_64 mipChainSliceSize = 0; + UINT_64 mipSize[MaxMipLevels]; + UINT_64 mipSliceSize[MaxMipLevels]; + + const ADDR_EXTENT3D fixedTailMaxDim = tailMaxDim; + + for (UINT_32 mipIdx = 0; mipIdx < pIn->numMipLevels; mipIdx++) + { + const ADDR_EXTENT3D mipExtents = GetMipExtent(mip0Dims, mipIdx); + + if (SupportsMipTail(pIn->swizzleMode) && + IsInMipTail(fixedTailMaxDim, mipExtents, maxMipsInTail, pIn->numMipLevels - mipIdx)) + { + firstMipInTail = mipIdx; + mipChainSliceSize += blockSize / pOut->blockExtent.depth; + break; + } + else + { + const UINT_32 pitch = UseCustomPitch(pIn) + ? pOut->pitch + : ((mipIdx == 0) && CanTrimLinearPadding(pIn)) + ? PowTwoAlign(mipExtents.width, 128u / bytesPerPixel) + : PowTwoAlign(mipExtents.width, pOut->blockExtent.width); + const UINT_32 height = UseCustomHeight(pIn) + ? pOut->height + : PowTwoAlign(mipExtents.height, pOut->blockExtent.height); + const UINT_32 depth = PowTwoAlign(mipExtents.depth, pOut->blockExtent.depth); + + // The original "blockExtent" calculation does subtraction of logs (i.e., division) to get the + // sizes. We aligned our pitch and height to those sizes, which means we need to multiply the various + // factors back together to get back to the slice size. 
+ const UINT_64 sliceSize = static_cast(pitch) * height * pIn->numSamples * (pIn->bpp >> 3); + + mipSize[mipIdx] = sliceSize * depth; + mipSliceSize[mipIdx] = sliceSize * pOut->blockExtent.depth; + mipChainSliceSize += sliceSize; + + if (pOut->pMipInfo != NULL) + { + pOut->pMipInfo[mipIdx].pitch = pitch; + pOut->pMipInfo[mipIdx].height = height; + pOut->pMipInfo[mipIdx].depth = depth; + + // The slice size of a linear image was calculated above as if the "pitch" is 256 byte aligned. + // However, the rendering pitch is aligned to 128 bytes, and that is what needs to be reported + // to our clients. + if (IsLinear(pIn->swizzleMode)) + { + pOut->pMipInfo[mipIdx].pitch = PowTwoAlign(mipExtents.width, 128u / bytesPerPixel); + } + } + } + } + + pOut->sliceSize = mipChainSliceSize; + pOut->surfSize = mipChainSliceSize * pOut->numSlices; + pOut->mipChainInTail = (firstMipInTail == 0) ? TRUE : FALSE; + pOut->firstMipIdInTail = firstMipInTail; + + if (pOut->pMipInfo != NULL) + { + if (IsLinear(pIn->swizzleMode)) + { + // 1. Linear swizzle mode doesn't have miptails. + // 2. The organization of linear 3D mipmap resource is same as GFX11, we should use mip slice size to + // caculate mip offset. 
+ ADDR_ASSERT(firstMipInTail == pIn->numMipLevels); + + UINT_64 sliceSize = 0; + + for (INT_32 i = static_cast(pIn->numMipLevels) - 1; i >= 0; i--) + { + pOut->pMipInfo[i].offset = sliceSize; + pOut->pMipInfo[i].macroBlockOffset = sliceSize; + pOut->pMipInfo[i].mipTailOffset = 0; + + sliceSize += mipSliceSize[i]; + } + } + else + { + UINT_64 offset = 0; + UINT_64 macroBlkOffset = 0; + UINT_32 tailMaxDepth = 0; + + ADDR_EXTENT3D mipExtentFirstInTail = {}; + if (firstMipInTail != pIn->numMipLevels) + { + mipExtentFirstInTail = GetMipExtent(mip0Dims, firstMipInTail); + + offset = blockSize * + PowTwoAlign(mipExtentFirstInTail.depth, + pOut->blockExtent.depth) / pOut->blockExtent.depth; + macroBlkOffset = blockSize; + } + + for (INT_32 i = firstMipInTail - 1; i >= 0; i--) + { + pOut->pMipInfo[i].offset = offset; + pOut->pMipInfo[i].macroBlockOffset = macroBlkOffset; + pOut->pMipInfo[i].mipTailOffset = 0; + + offset += mipSize[i]; + macroBlkOffset += mipSliceSize[i]; + } + + GetMipOrigin(pIn, mipExtentFirstInTail, pOut); + } + } +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSurfaceInfo +* +* @brief +* Internal function to calculate alignment for a surface +* +* @return +* VOID +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut ///< [out] output structure + ) const +{ + ComputeBlockDimensionForSurf(&pOut->blockExtent, + pIn->bpp, + pIn->numSamples, + pIn->swizzleMode); + + ADDR_E_RETURNCODE returnCode = ApplyCustomizedPitchHeight(pIn, pOut); + + if (returnCode == ADDR_OK) + { + pOut->numSlices = PowTwoAlign(pIn->numSlices, pOut->blockExtent.depth); + pOut->baseAlign = 1 << GetBlockSizeLog2(pIn->swizzleMode); + + 
GetMipOffset(pIn, pOut); + + SanityCheckSurfSize(pIn, pOut); + + // Slices must be exact multiples of the block sizes. However: + // - with 3D images, one block will contain multiple slices, so that needs to be taken into account. + // - with linear images that have only once slice, we may trim and use the pitch alignment for size. + ADDR_ASSERT(((pOut->sliceSize * pOut->blockExtent.depth) % + GetBlockSize(pIn->swizzleMode, CanTrimLinearPadding(pIn))) == 0); + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetBaseMipExtents +* +* @brief +* Return the size of the base mip level in a nice cozy little structure. +* +************************************************************************************************************************ +*/ +ADDR_EXTENT3D Gfx12Lib::GetBaseMipExtents( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn + ) const +{ + return { pIn->width, + pIn->height, + (IsTex3d(pIn->resourceType) ? pIn->numSlices : 1) }; // slices is depth for 3d +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetMaxNumMipsInTail +* +* @brief +* Return max number of mips in tails +* +* @return +* Max number of mips in tails +************************************************************************************************************************ +*/ +UINT_32 Gfx12Lib::GetMaxNumMipsInTail( + Addr3SwizzleMode swizzleMode, + UINT_32 blockSizeLog2 ///< block size log2 + ) const +{ + UINT_32 effectiveLog2 = blockSizeLog2; + UINT_32 mipsInTail = 1; + + if (Is3dSwizzle(swizzleMode)) + { + effectiveLog2 -= (blockSizeLog2 - 8) / 3; + } + + if (effectiveLog2 > 8) + { + mipsInTail = (effectiveLog2 <= 11) ? 
(1 + (1 << (effectiveLog2 - 9))) : (effectiveLog2 - 4); + } + + return mipsInTail; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled +* +* @brief +* Internal function to calculate address from coord for tiled swizzle surface +* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut ///< [out] output structure + ) const +{ + // 256B block cannot support 3D image. + ADDR_ASSERT((IsTex3d(pIn->resourceType) && IsBlock256b(pIn->swizzleMode)) == FALSE); + + ADDR3_COMPUTE_SURFACE_INFO_INPUT localIn = {}; + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT localOut = {}; + ADDR3_MIP_INFO mipInfo[MaxMipLevels]; + + localIn.size = sizeof(localIn); + localIn.flags = pIn->flags; + localIn.swizzleMode = pIn->swizzleMode; + localIn.resourceType = pIn->resourceType; + localIn.format = ADDR_FMT_INVALID; + localIn.bpp = pIn->bpp; + localIn.width = Max(pIn->unAlignedDims.width, 1u); + localIn.height = Max(pIn->unAlignedDims.height, 1u); + localIn.numSlices = Max(pIn->unAlignedDims.depth, 1u); + localIn.numMipLevels = Max(pIn->numMipLevels, 1u); + localIn.numSamples = Max(pIn->numSamples, 1u); + + localOut.size = sizeof(localOut); + localOut.pMipInfo = mipInfo; + + ADDR_E_RETURNCODE ret = ComputeSurfaceInfo(&localIn, &localOut); + + if (ret == ADDR_OK) + { + const UINT_32 elemLog2 = Log2(pIn->bpp >> 3); + const UINT_32 blkSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const UINT_32 eqIndex = GetEquationTableEntry(pIn->swizzleMode, Log2(localIn.numSamples), elemLog2); + + if (eqIndex != ADDR_INVALID_EQUATION_INDEX) + { + const BOOL_32 inTail = 
((mipInfo[pIn->mipId].mipTailOffset != 0) && (blkSizeLog2 != Log2Size256)); + const BOOL_32 is3dNoMsaa = ((IsTex3d(pIn->resourceType) == TRUE) && (localIn.numSamples == 1)); + const UINT_64 sliceSize = is3dNoMsaa ? (localOut.sliceSize * localOut.blockExtent.depth) + : localOut.sliceSize; + const UINT_32 sliceId = is3dNoMsaa ? (pIn->slice / localOut.blockExtent.depth) : pIn->slice; + const UINT_32 x = inTail ? (pIn->x + mipInfo[pIn->mipId].mipTailCoordX) : pIn->x; + const UINT_32 y = inTail ? (pIn->y + mipInfo[pIn->mipId].mipTailCoordY) : pIn->y; + const UINT_32 z = inTail ? (pIn->slice + mipInfo[pIn->mipId].mipTailCoordZ) : pIn->slice; + const UINT_32 pb = mipInfo[pIn->mipId].pitch / localOut.blockExtent.width; + const UINT_32 yb = pIn->y / localOut.blockExtent.height; + const UINT_32 xb = pIn->x / localOut.blockExtent.width; + const UINT_64 blkIdx = yb * pb + xb; + const UINT_32 blkOffset = ComputeOffsetFromEquation(&m_equationTable[eqIndex], + x << elemLog2, + y, + z, + pIn->sample); + pOut->addr = sliceSize * sliceId + + mipInfo[pIn->mipId].macroBlockOffset + + (blkIdx << blkSizeLog2) + + blkOffset; + } + else + { + ret = ADDR_INVALIDPARAMS; + } + } + + return ret; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputePipeBankXor +* +* @brief +* Generate a PipeBankXor value to be ORed into bits above numSwizzleBits of address +* +* @return +* PipeBankXor value +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure + ) const +{ + if ((m_numSwizzleBits != 0) && // does this configuration support swizzling + // base address XOR in GFX12 will be applied to all blk_size = 4KB, 64KB, or 256KB swizzle modes, + 
// Note that Linear and 256B are excluded. + (IsLinear(pIn->swizzleMode) == FALSE) && + (IsBlock256b(pIn->swizzleMode) == FALSE)) + { + pOut->pipeBankXor = pIn->surfIndex % (1 << m_numSwizzleBits); + } + else + { + pOut->pipeBankXor = 0; + } + + return ADDR_OK; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::ComputeOffsetFromEquation +* +* @brief +* Compute offset from equation +* +* @return +* Offset +************************************************************************************************************************ +*/ +UINT_32 Gfx12Lib::ComputeOffsetFromEquation( + const ADDR_EQUATION* pEq, ///< Equation + UINT_32 x, ///< x coord in bytes + UINT_32 y, ///< y coord in pixel + UINT_32 z, ///< z coord in slice + UINT_32 s ///< MSAA sample index + ) const +{ + UINT_32 offset = 0; + + for (UINT_32 i = 0; i < pEq->numBits; i++) + { + UINT_32 v = 0; + + if (pEq->addr[i].valid) + { + if (pEq->addr[i].channel == 0) + { + v ^= (x >> pEq->addr[i].index) & 1; + } + else if (pEq->addr[i].channel == 1) + { + v ^= (y >> pEq->addr[i].index) & 1; + } + else if (pEq->addr[i].channel == 2) + { + v ^= (z >> pEq->addr[i].index) & 1; + } + else if (pEq->addr[i].channel == 3) + { + v ^= (s >> pEq->addr[i].index) & 1; + } + else + { + ADDR_ASSERT_ALWAYS(); + } + } + + offset |= (v << i); + } + + return offset; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::GetSwizzlePatternInfo +* +* @brief +* Get swizzle pattern +* +* @return +* Swizzle pattern information +************************************************************************************************************************ +*/ +const ADDR_SW_PATINFO* Gfx12Lib::GetSwizzlePatternInfo( + Addr3SwizzleMode swizzleMode, ///< Swizzle mode + UINT_32 elemLog2, ///< Element size in bytes log2 + UINT_32 numFrag ///< Number of fragment + ) const +{ 
+ const ADDR_SW_PATINFO* patInfo = NULL; + + if (Is2dSwizzle(swizzleMode) == FALSE) + { + ADDR_ASSERT(numFrag == 1); + } + + switch (swizzleMode) + { + case ADDR3_256KB_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_256KB_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_256KB_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_256KB_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_256KB_2D_8xAA_PATINFO; + break; + default: + ADDR_ASSERT_ALWAYS(); + } + break; + case ADDR3_256KB_3D: + patInfo = GFX12_SW_256KB_3D_PATINFO; + break; + case ADDR3_64KB_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_64KB_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_64KB_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_64KB_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_64KB_2D_8xAA_PATINFO; + break; + default: + ADDR_ASSERT_ALWAYS(); + } + break; + case ADDR3_64KB_3D: + patInfo = GFX12_SW_64KB_3D_PATINFO; + break; + case ADDR3_4KB_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_4KB_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_4KB_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_4KB_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_4KB_2D_8xAA_PATINFO; + break; + default: + ADDR_ASSERT_ALWAYS(); + } + break; + case ADDR3_4KB_3D: + patInfo = GFX12_SW_4KB_3D_PATINFO; + break; + case ADDR3_256B_2D: + switch (numFrag) + { + case 1: + patInfo = GFX12_SW_256B_2D_1xAA_PATINFO; + break; + case 2: + patInfo = GFX12_SW_256B_2D_2xAA_PATINFO; + break; + case 4: + patInfo = GFX12_SW_256B_2D_4xAA_PATINFO; + break; + case 8: + patInfo = GFX12_SW_256B_2D_8xAA_PATINFO; + break; + default: + break; + } + break; + default: + ADDR_ASSERT_ALWAYS(); + break; + } + + return (patInfo != NULL) ? 
&patInfo[elemLog2] : NULL; +} +/** +************************************************************************************************************************ +* Gfx12Lib::HwlInitGlobalParams +* +* @brief +* Initializes global parameters +* +* @return +* TRUE if all settings are valid +* +************************************************************************************************************************ +*/ +BOOL_32 Gfx12Lib::HwlInitGlobalParams( + const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input +{ + BOOL_32 valid = TRUE; + GB_ADDR_CONFIG_GFX12 gbAddrConfig; + + gbAddrConfig.u32All = pCreateIn->regValue.gbAddrConfig; + + switch (gbAddrConfig.bits.NUM_PIPES) + { + case ADDR_CONFIG_1_PIPE: + m_pipesLog2 = 0; + break; + case ADDR_CONFIG_2_PIPE: + m_pipesLog2 = 1; + break; + case ADDR_CONFIG_4_PIPE: + m_pipesLog2 = 2; + break; + case ADDR_CONFIG_8_PIPE: + m_pipesLog2 = 3; + break; + case ADDR_CONFIG_16_PIPE: + m_pipesLog2 = 4; + break; + case ADDR_CONFIG_32_PIPE: + m_pipesLog2 = 5; + break; + case ADDR_CONFIG_64_PIPE: + m_pipesLog2 = 6; + break; + default: + ADDR_ASSERT_ALWAYS(); + valid = FALSE; + break; + } + + switch (gbAddrConfig.bits.PIPE_INTERLEAVE_SIZE) + { + case ADDR_CONFIG_PIPE_INTERLEAVE_256B: + m_pipeInterleaveLog2 = 8; + break; + case ADDR_CONFIG_PIPE_INTERLEAVE_512B: + m_pipeInterleaveLog2 = 9; + break; + case ADDR_CONFIG_PIPE_INTERLEAVE_1KB: + m_pipeInterleaveLog2 = 10; + break; + case ADDR_CONFIG_PIPE_INTERLEAVE_2KB: + m_pipeInterleaveLog2 = 11; + break; + default: + ADDR_ASSERT_ALWAYS(); + valid = FALSE; + break; + } + + m_numSwizzleBits = ((m_pipesLog2 >= 3) ? m_pipesLog2 - 2 : 0); + + if (valid) + { + InitEquationTable(); + } + + return valid; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeNonBlockCompressedView +* +* @brief +* Compute non-block-compressed view for a given mipmap level/slice. 
+* +* @return +* ADDR_E_RETURNCODE +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + if (((pIn->format < ADDR_FMT_ASTC_4x4) || (pIn->format > ADDR_FMT_ETC2_128BPP)) && + ((pIn->format < ADDR_FMT_BC1) || (pIn->format > ADDR_FMT_BC7))) + { + // Only support BC1~BC7, ASTC, or ETC2 for now... + returnCode = ADDR_NOTSUPPORTED; + } + else + { + UINT_32 bcWidth, bcHeight; + const UINT_32 bpp = GetElemLib()->GetBitsPerPixel(pIn->format, NULL, &bcWidth, &bcHeight); + + ADDR3_COMPUTE_SURFACE_INFO_INPUT infoIn = {}; + infoIn.size = sizeof(infoIn); + infoIn.flags = pIn->flags; + infoIn.swizzleMode = pIn->swizzleMode; + infoIn.resourceType = pIn->resourceType; + infoIn.format = pIn->format; + infoIn.bpp = bpp; + infoIn.width = RoundUpQuotient(pIn->unAlignedDims.width, bcWidth); + infoIn.height = RoundUpQuotient(pIn->unAlignedDims.height, bcHeight); + infoIn.numSlices = pIn->unAlignedDims.depth; + infoIn.numMipLevels = pIn->numMipLevels; + infoIn.numSamples = 1; + + ADDR3_MIP_INFO mipInfo[MaxMipLevels] = {}; + + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT infoOut = {}; + infoOut.size = sizeof(infoOut); + infoOut.pMipInfo = mipInfo; + + returnCode = HwlComputeSurfaceInfo(&infoIn, &infoOut); + + if (returnCode == ADDR_OK) + { + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT subOffIn = {}; + subOffIn.size = sizeof(subOffIn); + subOffIn.swizzleMode = infoIn.swizzleMode; + subOffIn.resourceType = infoIn.resourceType; + subOffIn.pipeBankXor = pIn->pipeBankXor; + subOffIn.slice = pIn->slice; + subOffIn.sliceSize = infoOut.sliceSize; + subOffIn.macroBlockOffset = mipInfo[pIn->mipId].macroBlockOffset; + subOffIn.mipTailOffset = 
mipInfo[pIn->mipId].mipTailOffset; + + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT subOffOut = {}; + subOffOut.size = sizeof(subOffOut); + + // For any mipmap level, move nonBc view base address by offset + HwlComputeSubResourceOffsetForSwizzlePattern(&subOffIn, &subOffOut); + pOut->offset = subOffOut.offset; + + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT slicePbXorIn = {}; + slicePbXorIn.size = sizeof(slicePbXorIn); + slicePbXorIn.swizzleMode = infoIn.swizzleMode; + slicePbXorIn.resourceType = infoIn.resourceType; + slicePbXorIn.bpe = infoIn.bpp; + slicePbXorIn.basePipeBankXor = pIn->pipeBankXor; + slicePbXorIn.slice = pIn->slice; + slicePbXorIn.numSamples = 1; + + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT slicePbXorOut = {}; + slicePbXorOut.size = sizeof(slicePbXorOut); + + // For any mipmap level, nonBc view should use computed pbXor + HwlComputeSlicePipeBankXor(&slicePbXorIn, &slicePbXorOut); + pOut->pipeBankXor = slicePbXorOut.pipeBankXor; + + const BOOL_32 tiled = (pIn->swizzleMode != ADDR3_LINEAR); + const BOOL_32 inTail = tiled && (pIn->mipId >= infoOut.firstMipIdInTail); + const UINT_32 requestMipWidth = + RoundUpQuotient(Max(pIn->unAlignedDims.width >> pIn->mipId, 1u), bcWidth); + const UINT_32 requestMipHeight = + RoundUpQuotient(Max(pIn->unAlignedDims.height >> pIn->mipId, 1u), bcHeight); + + if (inTail) + { + // For mipmap level that is in mip tail block, hack a lot of things... + // Basically all mipmap levels in tail block will be viewed as a small mipmap chain that all levels + // are fit in tail block: + + // - mipId = relative mip id (which is counted from first mip ID in tail in original mip chain) + pOut->mipId = pIn->mipId - infoOut.firstMipIdInTail; + + // - at least 2 mipmap levels (since only 1 mipmap level will not be viewed as mipmap!) 
+ pOut->numMipLevels = Max(infoIn.numMipLevels - infoOut.firstMipIdInTail, 2u); + + // - (mip0) width = requestMipWidth << mipId, the value can't exceed mip tail dimension threshold + pOut->unAlignedDims.width = Min(requestMipWidth << pOut->mipId, infoOut.blockExtent.width / 2); + + // - (mip0) height = requestMipHeight << mipId, the value can't exceed mip tail dimension threshold + pOut->unAlignedDims.height = Min(requestMipHeight << pOut->mipId, infoOut.blockExtent.height); + } + // This check should cover at least mipId == 0 + else if ((requestMipWidth << pIn->mipId) == infoIn.width) + { + // For mipmap level [N] that is not in mip tail block and downgraded without losing element: + // - only one mipmap level and mipId = 0 + pOut->mipId = 0; + pOut->numMipLevels = 1; + + // (mip0) width = requestMipWidth + pOut->unAlignedDims.width = requestMipWidth; + + // (mip0) height = requestMipHeight + pOut->unAlignedDims.height = requestMipHeight; + } + else + { + // For mipmap level [N] that is not in mip tail block and downgraded with element losing, + // We have to make it a multiple mipmap view (2 levels view here), add one extra element if needed, + // because single mip view may have different pitch value than original (multiple) mip view... + // A simple case would be: + // - 64KB block swizzle mode, 8 Bytes-Per-Element. Block dim = [0x80, 0x40] + // - 2 mipmap levels with API mip0 width = 0x401/mip1 width = 0x200 and non-BC view + // mip0 width = 0x101/mip1 width = 0x80 + // By multiple mip view, the pitch for mip level 1 would be 0x100 bytes, due to rounding up logic in + // GetMipSize(), and by single mip level view the pitch will only be 0x80 bytes. 
+ + // - 2 levels and mipId = 1 + pOut->mipId = 1; + pOut->numMipLevels = 2; + + const UINT_32 upperMipWidth = + RoundUpQuotient(Max(pIn->unAlignedDims.width >> (pIn->mipId - 1), 1u), bcWidth); + const UINT_32 upperMipHeight = + RoundUpQuotient(Max(pIn->unAlignedDims.height >> (pIn->mipId - 1), 1u), bcHeight); + + const BOOL_32 needToAvoidInTail = tiled && + (requestMipWidth <= infoOut.blockExtent.width / 2) && + (requestMipHeight <= infoOut.blockExtent.height); + + const UINT_32 hwMipWidth = + PowTwoAlign(ShiftCeil(infoIn.width, pIn->mipId), infoOut.blockExtent.width); + const UINT_32 hwMipHeight = + PowTwoAlign(ShiftCeil(infoIn.height, pIn->mipId), infoOut.blockExtent.height); + + const BOOL_32 needExtraWidth = + ((upperMipWidth < requestMipWidth * 2) || + ((upperMipWidth == requestMipWidth * 2) && + ((needToAvoidInTail == TRUE) || + (hwMipWidth > PowTwoAlign(requestMipWidth, infoOut.blockExtent.width))))); + + const BOOL_32 needExtraHeight = + ((upperMipHeight < requestMipHeight * 2) || + ((upperMipHeight == requestMipHeight * 2) && + ((needToAvoidInTail == TRUE) || + (hwMipHeight > PowTwoAlign(requestMipHeight, infoOut.blockExtent.height))))); + + // (mip0) width = requestLastMipLevelWidth + pOut->unAlignedDims.width = upperMipWidth + (needExtraWidth ? 1: 0); + + // (mip0) height = requestLastMipLevelHeight + pOut->unAlignedDims.height = upperMipHeight + (needExtraHeight ? 
1: 0); + } + + // Assert the downgrading from this mip[0] width would still generate correct mip[N] width + ADDR_ASSERT(ShiftRight(pOut->unAlignedDims.width, pOut->mipId) == requestMipWidth); + // Assert the downgrading from this mip[0] height would still generate correct mip[N] height + ADDR_ASSERT(ShiftRight(pOut->unAlignedDims.height, pOut->mipId) == requestMipHeight); + } + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSubResourceOffsetForSwizzlePattern +* +* @brief +* Compute sub resource offset to support swizzle pattern +* +* @return +* VOID +************************************************************************************************************************ +*/ +VOID Gfx12Lib::HwlComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut ///< [out] output structure + ) const +{ + pOut->offset = pIn->slice * pIn->sliceSize + pIn->macroBlockOffset; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::HwlComputeSlicePipeBankXor +* +* @brief +* Generate slice PipeBankXor value based on base PipeBankXor value and slice id +* +* @return +* PipeBankXor value +************************************************************************************************************************ +*/ +ADDR_E_RETURNCODE Gfx12Lib::HwlComputeSlicePipeBankXor( + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, ///< [in] input structure + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut ///< [out] output structure + ) const +{ + ADDR_E_RETURNCODE returnCode = ADDR_OK; + + // PipeBankXor is only applied to 4KB, 64KB and 256KB on GFX12. 
+ if ((IsLinear(pIn->swizzleMode) == FALSE) && (IsBlock256b(pIn->swizzleMode) == FALSE)) + { + if (pIn->bpe == 0) + { + // Require a valid bytes-per-element value passed from client... + returnCode = ADDR_INVALIDPARAMS; + } + else + { + const ADDR_SW_PATINFO* pPatInfo = GetSwizzlePatternInfo(pIn->swizzleMode, + Log2(pIn->bpe >> 3), + 1); + + if (pPatInfo != NULL) + { + const UINT_32 elemLog2 = Log2(pIn->bpe >> 3); + const UINT_32 eqIndex = GetEquationTableEntry(pIn->swizzleMode, Log2(pIn->numSamples), elemLog2); + + const UINT_32 pipeBankXorOffset = ComputeOffsetFromEquation(&m_equationTable[eqIndex], + 0, + 0, + pIn->slice, + 0); + + const UINT_32 pipeBankXor = pipeBankXorOffset >> m_pipeInterleaveLog2; + + // Should have no bit set under pipe interleave + ADDR_ASSERT((pipeBankXor << m_pipeInterleaveLog2) == pipeBankXorOffset); + + pOut->pipeBankXor = pIn->basePipeBankXor ^ pipeBankXor; + } + else + { + // Should never come here... + ADDR_NOT_IMPLEMENTED(); + + returnCode = ADDR_NOTSUPPORTED; + } + } + } + else + { + pOut->pipeBankXor = 0; + } + + return returnCode; +} + +/** +************************************************************************************************************************ +* Gfx12Lib::SanityCheckSurfSize +* +* @brief +* Calculate the surface size via the exact hardware algorithm to see if it matches. +* +* @return +************************************************************************************************************************ +*/ +void Gfx12Lib::SanityCheckSurfSize( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + const ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut + ) const +{ +#if DEBUG + // Verify that the requested image size is valid for the below algorithm. The below code includes + // implicit assumptions about the surface dimensions being less than "MaxImageDim"; otherwise, it can't + // calculate "firstMipInTail" accurately and the below assertion will trip incorrectly. 
// + // Surfaces destined for use only on the SDMA engine can exceed the gfx-engine-imposed limitations of + // the "maximum" image dimensions. + if ((pIn->width <= MaxImageDim) && + (pIn->height <= MaxImageDim) && + (pIn->numMipLevels <= MaxMipLevels) && + (UseCustomPitch(pIn) == FALSE) && + (UseCustomHeight(pIn) == FALSE) && + // HiZS surfaces have a reduced image size (i.e., each pixel represents an 8x8 region of the parent + // image, at least for single samples) but they still have the same number of mip levels as the + // parent image. This disconnect produces false assertions below as the image size doesn't apparently + // support the specified number of mip levels. + ((pIn->flags.hiZHiS == 0) || (pIn->numMipLevels == 1))) + { + UINT_32 lastMipSize = 1; + UINT_32 dataChainSize = 0; + + const ADDR_EXTENT3D mip0Dims = GetBaseMipExtents(pIn); + const UINT_32 blockSizeLog2 = GetBlockSizeLog2(pIn->swizzleMode); + const ADDR_EXTENT3D tailMaxDim = GetMipTailDim(pIn->swizzleMode, pOut->blockExtent); + const UINT_32 maxMipsInTail = GetMaxNumMipsInTail(pIn->swizzleMode, blockSizeLog2); + + UINT_32 firstMipInTail = 0; + for (INT_32 mipIdx = MaxMipLevels - 1; mipIdx >= 0; mipIdx--) + { + const ADDR_EXTENT3D mipExtents = GetMipExtent(mip0Dims, mipIdx); + + if ((mipExtents.width <= tailMaxDim.width) && + (mipExtents.height <= tailMaxDim.height) && + ((static_cast<INT_32>(pIn->numMipLevels) - mipIdx) < static_cast<INT_32>(maxMipsInTail))) + { + firstMipInTail = mipIdx; + } + } + + for (INT_32 mipIdx = firstMipInTail - 1; mipIdx >= -1; mipIdx--) + { + const ADDR_EXTENT3D mipExtents = GetMipExtent(mip0Dims, mipIdx); + const UINT_32 mipBlockWidth = ShiftCeil(mipExtents.width, Log2(pOut->blockExtent.width)); + const UINT_32 mipBlockHeight = ShiftCeil(mipExtents.height, Log2(pOut->blockExtent.height)); + + if (mipIdx < (static_cast<INT_32>(pIn->numMipLevels) - 1)) + { + dataChainSize += lastMipSize; + } + + if (mipIdx >= 0) + { + lastMipSize = 4 * lastMipSize + - ((mipBlockWidth & 1) ?
mipBlockHeight : 0) + - ((mipBlockHeight & 1) ? mipBlockWidth : 0) + - ((mipBlockWidth & mipBlockHeight & 1) ? 1 : 0); + } + } + + if (CanTrimLinearPadding(pIn)) + { + ADDR_ASSERT((pOut->sliceSize * pOut->blockExtent.depth) <= (dataChainSize << blockSizeLog2)); + } + else + { + ADDR_ASSERT((pOut->sliceSize * pOut->blockExtent.depth) == (dataChainSize << blockSizeLog2)); + } + } +#endif +} + +} // V3 +} // Addr +} // namespace rocr diff --git a/src/image/addrlib/src/gfx12/gfx12addrlib.h b/src/image/addrlib/src/gfx12/gfx12addrlib.h new file mode 100644 index 000000000..902a60b82 --- /dev/null +++ b/src/image/addrlib/src/gfx12/gfx12addrlib.h @@ -0,0 +1,218 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* SPDX-License-Identifier: MIT +* +***********************************************************************************************************************/ + +/** +************************************************************************************************************************ +* @file gfx12addrlib.h +* @brief Contains the Gfx12Lib class definition. +************************************************************************************************************************ +*/ + +#ifndef __GFX12_ADDR_LIB_H__ +#define __GFX12_ADDR_LIB_H__ + +#include "addrlib3.h" +#include "coord.h" +#include "gfx12SwizzlePattern.h" + +namespace rocr { +namespace Addr +{ +namespace V3 +{ + +/** +************************************************************************************************************************ +* @brief GFX12 specific settings structure. 
+************************************************************************************************************************ +*/ +struct Gfx12ChipSettings +{ + struct + { + // Misc configuration bits + UINT_32 reserved : 32; + }; +}; + +/** +************************************************************************************************************************ +* @brief GFX12 data surface type. +************************************************************************************************************************ +*/ + +/** +************************************************************************************************************************ +* @brief This class is the GFX12 specific address library +* function set. +************************************************************************************************************************ +*/ +class Gfx12Lib : public Lib +{ +public: + /// Creates Gfx12Lib object + static Addr::Lib* CreateObj(const Client* pClient) + { + VOID* pMem = Object::ClientAlloc(sizeof(Gfx12Lib), pClient); + return (pMem != NULL) ? new (pMem) Gfx12Lib(pClient) : NULL; + } + +protected: + Gfx12Lib(const Client* pClient); + virtual ~Gfx12Lib(); + + // Meta surfaces such as Hi-S/Z are essentially images on GFX12, so just return the max + // image alignment. 
+ virtual UINT_32 HwlComputeMaxMetaBaseAlignments() const { return 256 * 1024; } + + UINT_32 GetMaxNumMipsInTail( + Addr3SwizzleMode swizzleMode, + UINT_32 blockSizeLog2) const; + + BOOL_32 IsInMipTail( + const ADDR_EXTENT3D& mipTailDim, + const ADDR_EXTENT3D& mipDims, + UINT_32 maxNumMipsInTail, + UINT_32 numMipsToTheEnd) const + { + BOOL_32 inTail = ((mipDims.width <= mipTailDim.width) && + (mipDims.height <= mipTailDim.height) && + (numMipsToTheEnd <= maxNumMipsInTail)); + + return inTail; + } + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoordTiled( + const ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlComputeNonBlockCompressedView( + const ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT* pIn, + ADDR3_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT* pOut) const; + + virtual VOID HwlComputeSubResourceOffsetForSwizzlePattern( + const ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_INPUT* pIn, + ADDR3_COMPUTE_SUBRESOURCE_OFFSET_FORSWIZZLEPATTERN_OUTPUT* pOut) const; + + virtual ADDR_E_RETURNCODE HwlComputeSlicePipeBankXor( + const ADDR3_COMPUTE_SLICE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_SLICE_PIPEBANKXOR_OUTPUT* pOut) const; + + virtual UINT_32 HwlGetEquationTableInfo(const ADDR_EQUATION** ppEquationTable) const + { + *ppEquationTable = m_equationTable; + + return m_numEquations; + } + +private: + Gfx12ChipSettings m_settings; + static const SwizzleModeFlags SwizzleModeTable[ADDR3_MAX_TYPE]; + + virtual ADDR_E_RETURNCODE HwlComputePipeBankXor( + const ADDR3_COMPUTE_PIPEBANKXOR_INPUT* pIn, + ADDR3_COMPUTE_PIPEBANKXOR_OUTPUT* pOut) const override; + + virtual BOOL_32 HwlInitGlobalParams(const ADDR_CREATE_INPUT* pCreateIn) override; + + void SanityCheckSurfSize( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + const ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + UINT_32 m_numSwizzleBits; + + static const ADDR_EXTENT3D Block4K_Log2_3d[]; + static const ADDR_EXTENT3D 
Block64K_Log2_3d[]; + static const ADDR_EXTENT3D Block256K_Log2_3d[]; + + // Initialize equation table + VOID InitEquationTable(); + + VOID GetSwizzlePatternFromPatternInfo( + const ADDR_SW_PATINFO* pPatInfo, + ADDR_BIT_SETTING (&pSwizzle)[Log2Size256K]) const + { + memcpy(pSwizzle, + GFX12_SW_PATTERN_NIBBLE1[pPatInfo->nibble1Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE1[pPatInfo->nibble1Idx])); + + memcpy(&pSwizzle[8], + GFX12_SW_PATTERN_NIBBLE2[pPatInfo->nibble2Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE2[pPatInfo->nibble2Idx])); + + memcpy(&pSwizzle[12], + GFX12_SW_PATTERN_NIBBLE3[pPatInfo->nibble3Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE3[pPatInfo->nibble3Idx])); + + memcpy(&pSwizzle[16], + GFX12_SW_PATTERN_NIBBLE4[pPatInfo->nibble4Idx], + sizeof(GFX12_SW_PATTERN_NIBBLE4[pPatInfo->nibble4Idx])); + } + + VOID ConvertSwizzlePatternToEquation( + UINT_32 elemLog2, + Addr3SwizzleMode swMode, + const ADDR_SW_PATINFO* pPatInfo, + ADDR_EQUATION* pEquation) const; + + ADDR_EXTENT3D GetBaseMipExtents( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn) const; + + ADDR_EXTENT3D GetBlockPixelDimensions( + Addr3SwizzleMode swizzleMode, + UINT_32 log2BytesPerPixel) const; + + virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const override; + + static ADDR_EXTENT3D GetMipExtent( + const ADDR_EXTENT3D& mip0, + UINT_32 mipId) + { + return { + ShiftCeil(Max(mip0.width, 1u), mipId), + ShiftCeil(Max(mip0.height, 1u), mipId), + ShiftCeil(Max(mip0.depth, 1u), mipId) + }; + } + + //# See 6.3 in //gfxip/gfx10/doc/architecture/ImageAddressing/gfx10_image_addressing.docx + // miptail is applied to only larger block size (4kb, 64kb, 256kb), so there is no miptail in linear and + // 256b_2d addressing since they are both 256b block. 
+ BOOL_32 SupportsMipTail(Addr3SwizzleMode swizzleMode) const + { + return GetBlockSize(swizzleMode) > 256u; + } + + UINT_32 ComputeOffsetFromEquation( + const ADDR_EQUATION* pEq, + UINT_32 x, + UINT_32 y, + UINT_32 z, + UINT_32 s) const; + + const ADDR_SW_PATINFO* GetSwizzlePatternInfo( + Addr3SwizzleMode swizzleMode, + UINT_32 log2Elem, + UINT_32 numFrag) const; + + VOID GetMipOffset( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; + + VOID GetMipOrigin( + const ADDR3_COMPUTE_SURFACE_INFO_INPUT* pIn, + const ADDR_EXTENT3D& mipExtentFirstInTail, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const; +}; + +} // V3 +} // Addr +} // namespace rocr +#endif diff --git a/src/image/addrlib/src/gfx9/gfx9addrlib.cpp b/src/image/addrlib/src/gfx9/gfx9addrlib.cpp index 3b62d2d78..d98fd8058 100644 --- a/src/image/addrlib/src/gfx9/gfx9addrlib.cpp +++ b/src/image/addrlib/src/gfx9/gfx9addrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,9 +21,9 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// - namespace rocr { -namespace Addr { +namespace Addr +{ /** ************************************************************************************************************************ @@ -363,6 +346,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeCmaskInfo( // Generate the CMASK address equation. pOut->equation.gfx9.num_bits = Min(32u, eq->getsize()); + bool checked = false; for (unsigned b = 0; b < pOut->equation.gfx9.num_bits; b++) { CoordTerm &bit = (*eq)[b]; @@ -727,6 +711,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeDccInfo( // Generate the DCC address equation. 
pOut->equation.gfx9.num_bits = Min(32u, eq->getsize()); + bool checked = false; for (unsigned b = 0; b < pOut->equation.gfx9.num_bits; b++) { CoordTerm &bit = (*eq)[b]; @@ -2409,6 +2394,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeBlock256Equation( ADDR_E_RETURNCODE ret = ADDR_OK; pEquation->numBits = 8; + pEquation->numBitComponents = 1; UINT_32 i = 0; for (; i < elementBytesLog2; i++) @@ -2735,6 +2721,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeThinEquation( } } + FillEqBitComponents(pEquation); pEquation->numBits = blockSizeLog2; } @@ -3012,6 +2999,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlComputeThickEquation( } } + FillEqBitComponents(pEquation); pEquation->numBits = blockSizeLog2; } @@ -3722,7 +3710,9 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( // Apply optional restrictions if (pIn->flags.needEquation) { - FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3)); + UINT_32 components = pIn->flags.allowExtEquation ? ADDR_MAX_EQUATION_COMP : + ADDR_MAX_LEGACY_EQUATION_COMP; + FilterInvalidEqSwizzleMode(allowedSwModeSet, pIn->resourceType, Log2(bpp >> 3), components); } if (allowedSwModeSet.value == Gfx9LinearSwModeMask) @@ -3763,6 +3753,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( const UINT_32 ratioLow = computeMinSize ? 1 : (pIn->flags.opt4space ? 3 : 2); const UINT_32 ratioHi = computeMinSize ? 1 : (pIn->flags.opt4space ? 
2 : 1); + const UINT_64 sizeAlignInElement = Max(NextPow2(pIn->minSizeAlign) / (bpp >> 3), 1u); UINT_32 minSizeBlk = AddrBlockMicro; UINT_64 minSize = 0; @@ -3770,7 +3761,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockLinear; i < AddrBlockMaxTiledType; i++) { - if (IsBlockTypeAvaiable(allowedBlockSet, static_cast<AddrBlockType>(i))) + if (Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<AddrBlockType>(i))) { localIn.swizzleMode = swMode[i]; @@ -3788,7 +3779,7 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( padSize[i] = localOut.surfSize; if ((minSize == 0) || - BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) + Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], ratioLow, ratioHi)) { minSize = padSize[i]; minSizeBlk = i; @@ -3829,9 +3820,9 @@ ADDR_E_RETURNCODE Gfx9Lib::HwlGetPreferredSurfaceSetting( for (UINT_32 i = AddrBlockMicro; i < AddrBlockMaxTiledType; i++) { if ((i != minSizeBlk) && - IsBlockTypeAvaiable(allowedBlockSet, static_cast<AddrBlockType>(i))) + Addr2IsBlockTypeAvailable(allowedBlockSet, static_cast<AddrBlockType>(i))) { - if (BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) + if (Addr2BlockTypeWithinMemoryBudget(minSize, padSize[i], 0, 0, pIn->memoryBudget) == FALSE) { // Clear the block type if the memory waste is unacceptable allowedBlockSet.value &= ~(1u << (i - 1)); @@ -5227,4 +5218,4 @@ VOID Gfx9Lib::ComputeThinBlockDimension( } // V2 } // Addr -} // rocr \ No newline at end of file +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/gfx9/gfx9addrlib.h b/src/image/addrlib/src/gfx9/gfx9addrlib.h index 10ea35139..990a55db2 100644 --- a/src/image/addrlib/src/gfx9/gfx9addrlib.h +++ b/src/image/addrlib/src/gfx9/gfx9addrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved.
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "coord.h" namespace rocr { -namespace Addr { -namespace V2 { +namespace Addr +{ +namespace V2 +{ /** ************************************************************************************************************************ @@ -647,7 +632,6 @@ class Gfx9Lib : public Lib } // V2 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/r800/ciaddrlib.cpp b/src/image/addrlib/src/r800/ciaddrlib.cpp index 200f71589..81f39a239 100644 --- a/src/image/addrlib/src/r800/ciaddrlib.cpp +++ b/src/image/addrlib/src/r800/ciaddrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro 
Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -39,7 +22,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -2351,4 +2335,4 @@ BOOL_32 CiLib::CheckTcCompatibility( } // V1 } // Addr -} // rocr \ No newline at end of file +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/r800/ciaddrlib.h b/src/image/addrlib/src/r800/ciaddrlib.h index 894ddd321..997f0ba41 100644 --- a/src/image/addrlib/src/r800/ciaddrlib.h +++ 
b/src/image/addrlib/src/r800/ciaddrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "siaddrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** @@ -204,5 +189,7 @@ class CiLib : public SiLib } // V1 } // Addr -} // rocr +} // namespace rocr #endif + + diff --git a/src/image/addrlib/src/r800/egbaddrlib.cpp b/src/image/addrlib/src/r800/egbaddrlib.cpp index c762ab934..ee9a0a9eb 100644 --- a/src/image/addrlib/src/r800/egbaddrlib.cpp +++ b/src/image/addrlib/src/r800/egbaddrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ /** @@ -32,8 +15,10 @@ #include "egbaddrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** @@ -1558,6 +1543,8 @@ ADDR_E_RETURNCODE EgBasedLib::ComputeMacroTileEquation( pEquation->xor2[bankBitStart + i] = equation.xor2[i]; pEquation->numBits++; } + + FillEqBitComponents(pEquation); } } } @@ -3133,6 +3120,7 @@ UINT_32 EgBasedLib::ComputePipeRotation( } + /** **************************************************************************************************** * EgBasedLib::ComputeBankRotation @@ -4156,4 +4144,4 @@ UINT_32 EgBasedLib::HwlStereoCheckRightOffsetPadding( } // V1 } // Addr -} // rocr +} // namespace rocr \ No newline at end of file diff --git a/src/image/addrlib/src/r800/egbaddrlib.h b/src/image/addrlib/src/r800/egbaddrlib.h index ebae1ad6d..4a203c7d3 100644 --- a/src/image/addrlib/src/r800/egbaddrlib.h +++ b/src/image/addrlib/src/r800/egbaddrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -36,8 +19,10 @@ #include "addrlib1.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /// Structures for functions struct CoordFromBankPipe { @@ -423,7 +408,6 @@ class EgBasedLib : public Lib } // V1 } // Addr -} // rocr - +} // namespace rocr #endif diff --git a/src/image/addrlib/src/r800/siaddrlib.cpp b/src/image/addrlib/src/r800/siaddrlib.cpp index cb3760559..4abbed2b9 100644 --- a/src/image/addrlib/src/r800/siaddrlib.cpp +++ b/src/image/addrlib/src/r800/siaddrlib.cpp @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -38,7 +21,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// namespace rocr { -namespace Addr { +namespace Addr +{ /** **************************************************************************************************** @@ -419,6 +403,7 @@ ADDR_E_RETURNCODE SiLib::ComputeBankEquation( } } } + FillEqBitComponents(pEquation); if ((pTileInfo->bankWidth == 1) && ((pTileInfo->pipeConfig == ADDR_PIPECFG_P4_32x32) || @@ -1661,7 +1646,9 @@ UINT_32 SiLib::HwlGetPitchAlignmentLinear( } else { - pitchAlign = Max(8u, 64 / BITS_TO_BYTES(bpp)); + { + pitchAlign = Max(8u, 64 / BITS_TO_BYTES(bpp)); + } } return 
pitchAlign; @@ -2279,7 +2266,10 @@ BOOL_32 SiLib::DecodeGbRegs( reg.val = pRegValue->gbAddrConfig; - switch (reg.f.pipe_interleave_size) + UINT_32 pipe_interleave_size = reg.f.pipe_interleave_size; + UINT_32 row_size = reg.f.row_size; + + switch (pipe_interleave_size) { case ADDR_CONFIG_PIPE_INTERLEAVE_256B: m_pipeInterleaveBytes = ADDR_PIPEINTERLEAVE_256B; @@ -2293,7 +2283,7 @@ BOOL_32 SiLib::DecodeGbRegs( break; } - switch (reg.f.row_size) + switch (row_size) { case ADDR_CONFIG_1KB_ROW: m_rowSize = ADDR_ROWSIZE_1KB; @@ -3869,4 +3859,4 @@ BOOL_32 SiLib::IsEquationSupported( } // V1 } // Addr -} // rocr +} // namespace rocr diff --git a/src/image/addrlib/src/r800/siaddrlib.h b/src/image/addrlib/src/r800/siaddrlib.h index d5f23d80a..c8de9b904 100644 --- a/src/image/addrlib/src/r800/siaddrlib.h +++ b/src/image/addrlib/src/r800/siaddrlib.h @@ -2,24 +2,7 @@ ************************************************************************************************************************ * * Copyright (C) 2007-2022 Advanced Micro Devices, Inc. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR -* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE +* SPDX-License-Identifier: MIT * ***********************************************************************************************************************/ @@ -37,8 +20,10 @@ #include "egbaddrlib.h" namespace rocr { -namespace Addr { -namespace V1 { +namespace Addr +{ +namespace V1 +{ /** **************************************************************************************************** @@ -84,8 +69,11 @@ struct SiChipSettings UINT_32 isPolaris10 : 1; UINT_32 isPolaris11 : 1; UINT_32 isPolaris12 : 1; + // VI fusion UINT_32 isVegaM : 1; UINT_32 isCarrizo : 1; + + UINT_32 : 2; }; /** @@ -339,6 +327,6 @@ class SiLib : public EgBasedLib } // V1 } // Addr -} // rocr +} // namespace rocr #endif diff --git a/src/image/blit_kernel.cpp b/src/image/blit_kernel.cpp index afbafed66..5b6d9cfae 100644 --- a/src/image/blit_kernel.cpp +++ b/src/image/blit_kernel.cpp @@ -105,6 +105,8 @@ extern uint8_t ocl_blit_object_gfx1102[]; extern uint8_t ocl_blit_object_gfx1103[]; extern uint8_t ocl_blit_object_gfx1150[]; extern uint8_t ocl_blit_object_gfx1151[]; +extern uint8_t ocl_blit_object_gfx1200[]; +extern uint8_t ocl_blit_object_gfx1201[]; // Arguments inserted by OCL compiler, all zero here. 
struct OCLHiddenArgs { @@ -1052,6 +1054,10 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name, *blit_code_object = ocl_blit_object_gfx1150; } else if (sname == "gfx1151") { *blit_code_object = ocl_blit_object_gfx1151; + } else if (sname == "gfx1200") { + *blit_code_object = ocl_blit_object_gfx1200; + } else if (sname == "gfx1201") { + *blit_code_object = ocl_blit_object_gfx1201; } else { return HSA_STATUS_ERROR_INVALID_ISA_NAME; } diff --git a/src/image/blit_src/CMakeLists.txt b/src/image/blit_src/CMakeLists.txt index 481adf81a..94ba26267 100644 --- a/src/image/blit_src/CMakeLists.txt +++ b/src/image/blit_src/CMakeLists.txt @@ -50,7 +50,7 @@ if (NOT DEFINED TARGET_DEVICES) set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810" "gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx940;gfx941;gfx942" "gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" - "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151") + "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1200;gfx1201") endif() set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE ) diff --git a/src/image/image_manager_gfx11.cpp b/src/image/image_manager_gfx11.cpp index 20a9eeaf1..427dab386 100644 --- a/src/image/image_manager_gfx11.cpp +++ b/src/image/image_manager_gfx11.cpp @@ -703,11 +703,6 @@ uint32_t ImageManagerGfx11::GetAddrlibSurfaceInfoNv( prefSettingsInput.forbiddenBlock.macroThick64KB = 1; prefSettingsInput.forbiddenBlock.micro = 1; prefSettingsInput.forbiddenBlock.var = 1; - } else { - // Debug setting, simplifies buffer alignment until language runtimes have official gfx10 - // support. 
- prefSettingsInput.forbiddenBlock.macroThin64KB = 1; - prefSettingsInput.forbiddenBlock.macroThick64KB = 1; } // but don't ever allow the 256b swizzle modes diff --git a/src/image/image_manager_gfx12.cpp b/src/image/image_manager_gfx12.cpp new file mode 100644 index 000000000..14a37b8fe --- /dev/null +++ b/src/image/image_manager_gfx12.cpp @@ -0,0 +1,896 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#define NOMINMAX +#include "image_manager_gfx12.h" + +#include + +#include +#include + +#include "inc/hsa_ext_amd.h" +#include "core/inc/hsa_internal.h" +#include "core/util/utils.h" +#include "addrlib/src/core/addrlib.h" +#include "image_runtime.h" +#include "resource.h" +#include "resource_gfx12.h" +#include "util.h" +#include "device_info.h" + +namespace rocr { +namespace image { + +static_assert(sizeof(SQ_BUF_RSRC_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_BUF_RSRC_WORD3) == sizeof(uint32_t)); + +static_assert(sizeof(SQ_IMG_RSRC_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD3) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD4) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD5) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD6) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_RSRC_WORD7) == sizeof(uint32_t)); + +static_assert(sizeof(SQ_IMG_SAMP_WORD0) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD1) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD2) == sizeof(uint32_t)); +static_assert(sizeof(SQ_IMG_SAMP_WORD3) == sizeof(uint32_t)); + +//----------------------------------------------------------------------------- +// Workaround switch to combined format/type codes and missing gfx11 +// specific look up table. Only covers types used in image_lut_gfx11.cpp. 
+//----------------------------------------------------------------------------- +struct formatconverstion_t { + FMT fmt; + type type; + FORMAT format; +}; + +// Format/Type to combined format code table. +// Sorted and indexed to allow fast searches. +static const formatconverstion_t FormatLUT[] = { + {FMT_1_5_5_5, TYPE_UNORM, CFMT_1_5_5_5_UNORM}, // 0 + {FMT_10_10_10_2, TYPE_UNORM, CFMT_10_10_10_2_UNORM}, // 1 + {FMT_10_10_10_2, TYPE_SNORM, CFMT_10_10_10_2_SNORM}, // 2 + {FMT_10_10_10_2, TYPE_UINT, CFMT_10_10_10_2_UINT}, // 3 + {FMT_10_10_10_2, TYPE_SINT, CFMT_10_10_10_2_SINT}, // 4 + {FMT_16, TYPE_UNORM, CFMT_16_UNORM}, // 5 + {FMT_16, TYPE_SNORM, CFMT_16_SNORM}, // 6 + {FMT_16, TYPE_UINT, CFMT_16_UINT}, // 7 + {FMT_16, TYPE_SINT, CFMT_16_SINT}, // 8 + {FMT_16, TYPE_FLOAT, CFMT_16_FLOAT}, // 9 + {FMT_16, TYPE_USCALED, CFMT_16_USCALED}, // 10 + {FMT_16, TYPE_SSCALED, CFMT_16_SSCALED}, // 11 + {FMT_16_16, TYPE_UNORM, CFMT_16_16_UNORM}, // 12 + {FMT_16_16, TYPE_SNORM, CFMT_16_16_SNORM}, // 13 + {FMT_16_16, TYPE_UINT, CFMT_16_16_UINT}, // 14 + {FMT_16_16, TYPE_SINT, CFMT_16_16_SINT}, // 15 + {FMT_16_16, TYPE_FLOAT, CFMT_16_16_FLOAT}, // 16 + {FMT_16_16, TYPE_USCALED, CFMT_16_16_USCALED}, // 17 + {FMT_16_16, TYPE_SSCALED, CFMT_16_16_SSCALED}, // 18 + {FMT_16_16_16_16, TYPE_UNORM, CFMT_16_16_16_16_UNORM}, // 19 + {FMT_16_16_16_16, TYPE_SNORM, CFMT_16_16_16_16_SNORM}, // 20 + {FMT_16_16_16_16, TYPE_UINT, CFMT_16_16_16_16_UINT}, // 21 + {FMT_16_16_16_16, TYPE_SINT, CFMT_16_16_16_16_SINT}, // 22 + {FMT_16_16_16_16, TYPE_FLOAT, CFMT_16_16_16_16_FLOAT}, // 23 + {FMT_16_16_16_16, TYPE_USCALED, CFMT_16_16_16_16_USCALED}, // 24 + {FMT_16_16_16_16, TYPE_SSCALED, CFMT_16_16_16_16_SSCALED}, // 25 + {FMT_2_10_10_10, TYPE_UNORM, CFMT_2_10_10_10_UNORM}, // 26 + {FMT_2_10_10_10, TYPE_SNORM, CFMT_2_10_10_10_SNORM}, // 27 + {FMT_2_10_10_10, TYPE_UINT, CFMT_2_10_10_10_UINT}, // 28 + {FMT_2_10_10_10, TYPE_SINT, CFMT_2_10_10_10_SINT}, // 29 + {FMT_2_10_10_10, TYPE_USCALED, 
CFMT_2_10_10_10_USCALED}, // 30 + {FMT_2_10_10_10, TYPE_SSCALED, CFMT_2_10_10_10_SSCALED}, // 31 + {FMT_24_8, TYPE_UNORM, CFMT_24_8_UNORM}, // 32 + {FMT_24_8, TYPE_UINT, CFMT_24_8_UINT}, // 33 + {FMT_32, TYPE_UINT, CFMT_32_UINT}, // 34 + {FMT_32, TYPE_SINT, CFMT_32_SINT}, // 35 + {FMT_32, TYPE_FLOAT, CFMT_32_FLOAT}, // 36 + {FMT_32_32, TYPE_UINT, CFMT_32_32_UINT}, // 37 + {FMT_32_32, TYPE_SINT, CFMT_32_32_SINT}, // 38 + {FMT_32_32, TYPE_FLOAT, CFMT_32_32_FLOAT}, // 39 + {FMT_32_32_32, TYPE_UINT, CFMT_32_32_32_UINT}, // 40 + {FMT_32_32_32, TYPE_SINT, CFMT_32_32_32_SINT}, // 41 + {FMT_32_32_32, TYPE_FLOAT, CFMT_32_32_32_FLOAT}, // 42 + {FMT_32_32_32_32, TYPE_UINT, CFMT_32_32_32_32_UINT}, // 43 + {FMT_32_32_32_32, TYPE_SINT, CFMT_32_32_32_32_SINT}, // 44 + {FMT_32_32_32_32, TYPE_FLOAT, CFMT_32_32_32_32_FLOAT}, // 45 + {FMT_5_5_5_1, TYPE_UNORM, CFMT_5_5_5_1_UNORM}, // 46 + {FMT_5_6_5, TYPE_UNORM, CFMT_5_6_5_UNORM}, // 47 + {FMT_8, TYPE_UNORM, CFMT_8_UNORM}, // 48 + {FMT_8, TYPE_SNORM, CFMT_8_SNORM}, // 49 + {FMT_8, TYPE_UINT, CFMT_8_UINT}, // 50 + {FMT_8, TYPE_SINT, CFMT_8_SINT}, // 51 + {FMT_8, TYPE_SRGB, CFMT_8_SRGB}, // 52 + {FMT_8, TYPE_USCALED, CFMT_8_USCALED}, // 53 + {FMT_8, TYPE_SSCALED, CFMT_8_SSCALED}, // 54 + {FMT_8_24, TYPE_UNORM, CFMT_8_24_UNORM}, // 55 + {FMT_8_24, TYPE_UINT, CFMT_8_24_UINT}, // 56 + {FMT_8_8, TYPE_UNORM, CFMT_8_8_UNORM}, // 57 + {FMT_8_8, TYPE_SNORM, CFMT_8_8_SNORM}, // 58 + {FMT_8_8, TYPE_UINT, CFMT_8_8_UINT}, // 59 + {FMT_8_8, TYPE_SINT, CFMT_8_8_SINT}, // 60 + {FMT_8_8, TYPE_SRGB, CFMT_8_8_SRGB}, // 61 + {FMT_8_8, TYPE_USCALED, CFMT_8_8_USCALED}, // 62 + {FMT_8_8, TYPE_SSCALED, CFMT_8_8_SSCALED}, // 63 + {FMT_8_8_8_8, TYPE_UNORM, CFMT_8_8_8_8_UNORM}, // 64 + {FMT_8_8_8_8, TYPE_SNORM, CFMT_8_8_8_8_SNORM}, // 65 + {FMT_8_8_8_8, TYPE_UINT, CFMT_8_8_8_8_UINT}, // 66 + {FMT_8_8_8_8, TYPE_SINT, CFMT_8_8_8_8_SINT}, // 67 + {FMT_8_8_8_8, TYPE_SRGB, CFMT_8_8_8_8_SRGB}, // 68 + {FMT_8_8_8_8, TYPE_USCALED, CFMT_8_8_8_8_USCALED}, // 69 + 
{FMT_8_8_8_8, TYPE_SSCALED, CFMT_8_8_8_8_SSCALED} // 70 +}; +static const int FormatLUTSize = sizeof(FormatLUT)/sizeof(formatconverstion_t); + +//Index in FormatLUT to start search, indexed by FMT enum. +static const int FormatEntryPoint[] = { + 71, // FMT_INVALID + 48, // FMT_8 + 5, // FMT_16 + 57, // FMT_8_8 + 34, // FMT_32 + 12, // FMT_16_16 + 71, // FMT_10_11_11 + 71, // FMT_11_11_10 + 1, // FMT_10_10_10_2 + 26, // FMT_2_10_10_10 + 64, // FMT_8_8_8_8 + 37, // FMT_32_32 + 19, // FMT_16_16_16_16 + 40, // FMT_32_32_32 + 43, // FMT_32_32_32_32 + 71, // RESERVED + 47, // FMT_5_6_5 + 0, // FMT_1_5_5_5 + 46, // FMT_5_5_5_1 + 71, // FMT_4_4_4_4 + 55, // FMT_8_24 + 32 // FMT_24_8 +}; + +static FORMAT GetCombinedFormat(uint8_t fmt, uint8_t type) { + assert(fmt < sizeof(FormatEntryPoint)/sizeof(int) && "FMT out of range."); + int start = FormatEntryPoint[fmt]; + int stop = std::min(start + 6, FormatLUTSize); // Only 6 types are used in image_kv_lut.cpp + + for(int i=start; i> 3) * out.pitch; + size_t slicePitch = rowPitch * out.height; + if (desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB && + image_data_layout == HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR && + ((image_data_row_pitch && (rowPitch != image_data_row_pitch)) || + (image_data_slice_pitch && (slicePitch != image_data_slice_pitch)))) { + return static_cast( + HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED); + } + + image_info.size = out.surfSize; + assert(image_info.size != 0); + image_info.alignment = out.baseAlign; + assert(image_info.alignment != 0); + + return HSA_STATUS_SUCCESS; +} + +bool ImageManagerGfx12::IsLocalMemory(const void* address) const { + return true; +} + +hsa_status_t ImageManagerGfx12::PopulateImageSrd(Image& image, + const metadata_amd_t* descriptor) const { + const metadata_amd_gfx12_t* desc = reinterpret_cast(descriptor); + const void* image_data_addr = image.data; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + if ((image_prop.cap == 
HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED) || + (image_prop.element_size == 0)) + return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + + if (IsLocalMemory(image.data)) { + image_data_addr = reinterpret_cast( + reinterpret_cast(image.data) - local_memory_base_address_); + } + + image.srd[0] = desc->word0.u32All; + image.srd[1] = desc->word1.u32All; + image.srd[2] = desc->word2.u32All; + image.srd[3] = desc->word3.u32All; + image.srd[4] = desc->word4.u32All; + image.srd[5] = desc->word5.u32All; + image.srd[6] = desc->word6.u32All; + image.srd[7] = desc->word7.u32All; + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + SQ_BUF_RSRC_WORD0 word0; + SQ_BUF_RSRC_WORD1 word1; + SQ_BUF_RSRC_WORD3 word3; + + word0.val = 0; + word0.f.BASE_ADDRESS = PtrLow32(image_data_addr); + + word1.val = image.srd[1]; + word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr); + word1.f.STRIDE = image_prop.element_size; + + word3.val = image.srd[3]; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + + word3.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + word3.f.INDEX_STRIDE = image_prop.element_size; + + // New to GFX12 + //word3.f.WRITE_COMPRESS_ENABLE = 0; + //word3.f.COMPRESSION_EN = 0; + //word3.f.COMPRESSION_ACCESS_MODE = 0; + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[3] = word3.val; + } else { + uint32_t hwPixelSize = ImageLut().GetPixelSize(image_prop.data_format, image_prop.data_type); + + if (image_prop.element_size != hwPixelSize) { + return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED; + } + reinterpret_cast(&image.srd[0])->bits.BASE_ADDRESS = + PtrLow40Shift8(image_data_addr); + reinterpret_cast(&image.srd[1])->bits.BASE_ADDRESS_HI = + PtrHigh64Shift40(image_data_addr); + + // New to GFX12... 
+ //reinterpret_cast(&image.srd[1])->bits.MAX_MIP = 0; + + reinterpret_cast(&image.srd[1])->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + reinterpret_cast(&image.srd[3])->bits.DST_SEL_X = + swizzle.x; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_Y = + swizzle.y; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_Z = + swizzle.z; + reinterpret_cast(&image.srd[3])->bits.DST_SEL_W = + swizzle.w; + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1D) { + reinterpret_cast(&image.srd[3])->bits.TYPE = + ImageLut().MapGeometry(image.desc.geometry); + } + } + + // Looks like this is only used for CPU copies. + image.row_pitch = 0; + image.slice_pitch = 0; + + // Used by HSAIL shader ABI + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +static TEX_BC_SWIZZLE GetBcSwizzle(const Swizzle& swizzle) { + SEL r = (SEL)swizzle.x; + SEL g = (SEL)swizzle.y; + SEL b = (SEL)swizzle.z; + SEL a = (SEL)swizzle.w; + + TEX_BC_SWIZZLE bcSwizzle = TEX_BC_Swizzle_XYZW; + + if (a == SEL_X) { + // Have to use either TEX_BC_Swizzle_WZYX or TEX_BC_Swizzle_WXYZ + // + // For the pre-defined border color values (white, opaque black, + // transparent black), the only thing that matters is that the alpha + // channel winds up in the correct place (because the RGB channels are + // all the same) so either of these TEX_BC_Swizzle enumerations will + // work. Not sure what happens with border color palettes. 
+ if (b == SEL_Y) { + // ABGR + bcSwizzle = TEX_BC_Swizzle_WZYX; + } else if ((r == SEL_X) && (g == SEL_X) && (b == SEL_X)) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else { + // ARGB + bcSwizzle = TEX_BC_Swizzle_WXYZ; + } + } else if (r == SEL_X) { + // Have to use either TEX_BC_Swizzle_XYZW or TEX_BC_Swizzle_XWYZ + if (g == SEL_Y) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else if ((g == SEL_X) && (b == SEL_X) && (a == SEL_W)) { + // RGBA + bcSwizzle = TEX_BC_Swizzle_XYZW; + } else { + // RAGB + bcSwizzle = TEX_BC_Swizzle_XWYZ; + } + } else if (g == SEL_X) { + // GRAB, have to use TEX_BC_Swizzle_YXWZ + bcSwizzle = TEX_BC_Swizzle_YXWZ; + } else if (b == SEL_X) { + // BGRA, have to use TEX_BC_Swizzle_ZYXW + bcSwizzle = TEX_BC_Swizzle_ZYXW; + } + + return bcSwizzle; +} + + +hsa_status_t ImageManagerGfx12::PopulateImageSrd(Image& image) const { + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED); + assert(image_prop.element_size != 0); + + const void* image_data_addr = image.data; + + if (IsLocalMemory(image.data)) + image_data_addr = reinterpret_cast( + reinterpret_cast(image.data) - local_memory_base_address_); + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + SQ_BUF_RSRC_WORD0 word0; + SQ_BUF_RSRC_WORD1 word1; + SQ_BUF_RSRC_WORD2 word2; + SQ_BUF_RSRC_WORD3 word3; + + word0.val = 0; + word0.f.BASE_ADDRESS = PtrLow32(image_data_addr); + + word1.val = 0; + word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr); + word1.f.STRIDE = image_prop.element_size; + + word1.f.SWIZZLE_ENABLE = 0; + + word2.f.NUM_RECORDS = image.desc.width * image_prop.element_size; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + word3.val = 0; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + word3.f.FORMAT = 
GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + word3.f.INDEX_STRIDE = image_prop.element_size; + + // New to GFX12 + //word3.f.WRITE_COMPRESS_ENABLE = 0; + //word3.f.COMPRESSION_EN = 0; + //word3.f.COMPRESSION_ACCESS_MODE = 0; + + word3.f.TYPE = ImageLut().MapGeometry(image.desc.geometry); + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[2] = word2.val; + image.srd[3] = word3.val; + + image.row_pitch = image.desc.width * image_prop.element_size; + image.slice_pitch = image.row_pitch; + } else { + SQ_IMG_RSRC_WORD0 word0; + SQ_IMG_RSRC_WORD1 word1; + SQ_IMG_RSRC_WORD2 word2; + SQ_IMG_RSRC_WORD3 word3; + SQ_IMG_RSRC_WORD4 word4; + SQ_IMG_RSRC_WORD5 word5; + SQ_IMG_RSRC_WORD5 word6; + SQ_IMG_RSRC_WORD5 word7; + + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT out = {0}; + + uint32_t swizzleMode = GetAddrlibSurfaceInfoNv( + image.component, image.desc, image.tile_mode, + image.row_pitch, image.slice_pitch, out); + if (swizzleMode == (uint32_t)(-1)) { + return HSA_STATUS_ERROR; + } + + assert((out.bpp / 8) == image_prop.element_size); + + const size_t row_pitch_size = out.pitch * image_prop.element_size; + + word0.f.BASE_ADDRESS = PtrLow40Shift8(image_data_addr); + + word1.val = 0; + word1.f.BASE_ADDRESS_HI = PtrHigh64Shift40(image_data_addr); + + // New to GFX12 + //word1.f.MAX_MIP = 0; + //word1.f.BASE_LEVEL = 0; + + word1.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + // Only take the lowest 2 bits of (image.desc.width - 1) + word1.f.WIDTH = BitSelect<0, 1>(image.desc.width - 1); + + word2.val = 0; + // Take the high 14 bits of (image.desc.width - 1) + word2.f.WIDTH_HI = BitSelect<2, 15>(image.desc.width - 1); + word2.f.HEIGHT = image.desc.height ? 
image.desc.height - 1 : 0; + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + word3.val = 0; + word3.f.DST_SEL_X = swizzle.x; + word3.f.DST_SEL_Y = swizzle.y; + word3.f.DST_SEL_Z = swizzle.z; + word3.f.DST_SEL_W = swizzle.w; + //word3.f.NO_EDGE_CLAMP = 0; // New to GFX12 + //word3.f.LAST_LEVEL = 0; // New to GFX12 + word3.f.SW_MODE = swizzleMode; + word3.f.BC_SWIZZLE = GetBcSwizzle(swizzle); + word3.f.TYPE = ImageLut().MapGeometry(image.desc.geometry); + + const bool image_array = + (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DA || + image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DADEPTH); + const bool image_3d = (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_3D); + + word4.val = 0; + word4.f.DEPTH = + (image_array) // Doesn't hurt but isn't array_size already >0? + ? std::max(image.desc.array_size, static_cast(1)) - 1 + : (image_3d) ? image.desc.depth - 1 : 0; + + // For 1d, 2d and 2d-msaa this is pitch-1 + if (!image_array && !image_3d) { + uint32_t encPitch = out.pitch - 1; + word4.f.DEPTH = encPitch & 0x1fff; // 13 bits + word4.f.PITCH_MSB = (encPitch >> 13) & 0x3; // last 2 bits + } + + word5.val = 0; + word6.val = 0; + word7.val = 0; + + image.srd[0] = word0.val; + image.srd[1] = word1.val; + image.srd[2] = word2.val; + image.srd[3] = word3.val; + image.srd[4] = word4.val; + image.srd[5] = word5.val; + image.srd[6] = word6.val; + image.srd[7] = word7.val; + + image.row_pitch = row_pitch_size; + image.slice_pitch = out.sliceSize; + } + + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ImageManagerGfx12::ModifyImageSrd( + Image& image, hsa_ext_image_format_t& new_format) const { + image.desc.format = new_format; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + 
assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED); + assert(image_prop.element_size != 0); + + if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + SQ_BUF_RSRC_WORD3* word3 = + reinterpret_cast(&image.srd[3]); + word3->bits.DST_SEL_X = swizzle.x; + word3->bits.DST_SEL_Y = swizzle.y; + word3->bits.DST_SEL_Z = swizzle.z; + word3->bits.DST_SEL_W = swizzle.w; + word3->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + } else { + SQ_IMG_RSRC_WORD1* word1 = + reinterpret_cast(&image.srd[1]); + word1->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type); + + const Swizzle swizzle = ImageLut().MapSwizzle(image.desc.format.channel_order); + SQ_IMG_RSRC_WORD3* word3 = + reinterpret_cast(&image.srd[3]); + word3->bits.DST_SEL_X = swizzle.x; + word3->bits.DST_SEL_Y = swizzle.y; + word3->bits.DST_SEL_Z = swizzle.z; + word3->bits.DST_SEL_W = swizzle.w; + } + + image.srd[8] = image.desc.format.channel_type; + image.srd[9] = image.desc.format.channel_order; + image.srd[10] = static_cast(image.desc.width); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ImageManagerGfx12::PopulateSamplerSrd(Sampler& sampler) const { + const hsa_ext_sampler_descriptor_t sampler_descriptor = sampler.desc; + + SQ_IMG_SAMP_WORD0 word0; + SQ_IMG_SAMP_WORD1 word1; + SQ_IMG_SAMP_WORD2 word2; + SQ_IMG_SAMP_WORD3 word3; + + word0.u32All = 0; + switch (sampler_descriptor.address_mode) { + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + word0.bits.CLAMP_X = static_cast(SQ_TEX_CLAMP_LAST_TEXEL); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: + word0.bits.CLAMP_X = static_cast(SQ_TEX_CLAMP_BORDER); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + word0.bits.CLAMP_X = static_cast(SQ_TEX_MIRROR); + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: + case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: + 
word0.bits.CLAMP_X = static_cast(SQ_TEX_WRAP); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + word0.bits.CLAMP_Y = word0.bits.CLAMP_X; + word0.bits.CLAMP_Z = word0.bits.CLAMP_X; + word0.bits.FORCE_UNNORMALIZED = (sampler_descriptor.coordinate_mode == + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED); + + word1.u32All = 0; + word1.bits.MAX_LOD = 4095; + + word2.u32All = 0; + switch (sampler_descriptor.filter_mode) { + case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: + word2.bits.XY_MAG_FILTER = static_cast(SQ_TEX_XY_FILTER_POINT); + break; + case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: + word2.bits.XY_MAG_FILTER = static_cast(SQ_TEX_XY_FILTER_BILINEAR); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + word2.bits.XY_MIN_FILTER = word2.bits.XY_MAG_FILTER; + word2.bits.Z_FILTER = SQ_TEX_Z_FILTER_NONE; + word2.bits.MIP_FILTER = SQ_TEX_MIP_FILTER_NONE; + + word3.u32All = 0; + + // TODO: check this bit with HSAIL spec. + word3.bits.BORDER_COLOR_TYPE = SQ_TEX_BORDER_COLOR_TRANS_BLACK; + + sampler.srd[0] = word0.u32All; + sampler.srd[1] = word1.u32All; + sampler.srd[2] = word2.u32All; + sampler.srd[3] = word3.u32All; + + return HSA_STATUS_SUCCESS; +} + +uint32_t ImageManagerGfx12::GetAddrlibSurfaceInfoNv( + hsa_agent_t component, const hsa_ext_image_descriptor_t& desc, + Image::TileMode tileMode, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT& out) const { + const ImageProperty image_prop = + GetImageProperty(component, desc.format, desc.geometry); + + const AddrFormat addrlib_format = GetAddrlibFormat(image_prop); + + const uint32_t width = static_cast(desc.width); + const uint32_t height = static_cast(desc.height); + static const size_t kMinNumSlice = 1; + const uint32_t num_slice = static_cast( + std::max(kMinNumSlice, std::max(desc.array_size, desc.depth))); + + ADDR3_COMPUTE_SURFACE_INFO_INPUT in = {0}; + in.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_INPUT); + in.format = addrlib_format; 
+ in.bpp = static_cast(image_prop.element_size) * 8; + in.width = width; + in.height = height; + in.numSlices = num_slice; + in.pitchInElement = image_data_row_pitch / image_prop.element_size; + + switch (desc.geometry) { + case HSA_EXT_IMAGE_GEOMETRY_1D: + case HSA_EXT_IMAGE_GEOMETRY_1DB: + case HSA_EXT_IMAGE_GEOMETRY_1DA: + in.resourceType = ADDR_RSRC_TEX_1D; + break; + + case HSA_EXT_IMAGE_GEOMETRY_2D: + case HSA_EXT_IMAGE_GEOMETRY_2DDEPTH: + case HSA_EXT_IMAGE_GEOMETRY_2DA: + case HSA_EXT_IMAGE_GEOMETRY_2DADEPTH: + in.resourceType = ADDR_RSRC_TEX_2D; + break; + + case HSA_EXT_IMAGE_GEOMETRY_3D: + in.resourceType = ADDR_RSRC_TEX_3D; + break; + } + in.flags.texture = 1; + + if (tileMode == Image::TileMode::LINEAR) + { + in.swizzleMode = ADDR3_LINEAR; + } else { + + /* + * AddrLib3 does not provide the best swizzle mode (unlike AddrLib2). + * Instead, client has to request the list of possible swizzle mode and + * then pick the best one for its needs (i.e. performance/space tradeoffs). + * + */ + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT swOut = { 0 }; + swOut.size = sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_OUTPUT); + + ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT swIn = { 0 }; + swIn.size = sizeof(ADDR3_GET_POSSIBLE_SWIZZLE_MODE_INPUT); + swIn.flags = in.flags; + swIn.resourceType = in.resourceType; + swIn.bpp = in.bpp; + swIn.width = in.width; + swIn.height = in.height; + swIn.numSlices = in.numSlices; + swIn.numMipLevels = in.numMipLevels; + swIn.numSamples = in.numSamples; + /* + * Cannot leave it to 0 like GFX11 Addr2GetPreferredSurfaceSetting method + * as it triggers an ASSERT in AddrLib3 code. + * + * Setting it to 256K to allow for maximum number of swizzle mode in set + * returned (similar behaviour as GFX11). 
+ * + */ + swIn.maxAlign = 256 * 1024; + + + if (ADDR_OK != Addr3GetPossibleSwizzleModes(addr_lib_, &swIn, &swOut)) { + debug_print("Addr3GetPossibleSwizzleModes failed!\n"); + return (uint32_t) -1; + } + + /* + * Remove any modes that the client does not want (if any). + */ + //swOut.validModes.sw***** = 0; + + + /* + * Pick the "best" swizzle mode. + * + * This algorithm is based on behaviour in GFX11 AddrLib and on + * GFX12 code in PAL (that is also based on the GFX11 behaviour). + * + * Ratio variables control the extra space that can be used to get a larger + * swizzle mode. + * + * ratioLow:ratioHi meanings: + * + * 2:1 ratio - same behaviour as GFX11. + * 3:2 ratio - would be equivalent if flag opt4space in GFX11 (not used in ROCr) + * 1:1 ratio - minimum size, not necessary best for performance + * + */ + const UINT_32 ratioLow = 2; + const UINT_32 ratioHigh = 1; + + // Same behaviour as GFX11, remove linear if height is 1. + if (in.height > 1) { + swOut.validModes.swLinear = 0; + } + + UINT_64 minSize = 0; + Addr3SwizzleMode bestSwizzle = ADDR3_MAX_TYPE; + + for (uint32_t i = ADDR3_LINEAR; i < ADDR3_MAX_TYPE; i++) { + + if (swOut.validModes.value & (1 << i)) { + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT localOut = {0}; + localOut.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT); + + in.swizzleMode = (Addr3SwizzleMode) i; + + if (ADDR_OK != Addr3ComputeSurfaceInfo(addr_lib_, &in, &localOut)) { + // Should not happen, if it does, ignore this swizzle mode. 
+ debug_print("Addr3ComputeSurfaceInfo failed!\n"); + continue; + } + + UINT_64 surfaceSize = localOut.surfSize; + + if (bestSwizzle == ADDR3_MAX_TYPE) { + minSize = surfaceSize; + bestSwizzle = (Addr3SwizzleMode) i; + } else if ((surfaceSize * ratioHigh) <= (minSize * ratioLow)) { + bestSwizzle = (Addr3SwizzleMode) i; + } + } + } + + if (bestSwizzle < ADDR3_MAX_TYPE) { + in.swizzleMode = (Addr3SwizzleMode) bestSwizzle; + } else { + debug_print("Unable to find a valid swizzleMode for the surface!\n"); + return (uint32_t) -1; + } + } + + + out.size = sizeof(ADDR3_COMPUTE_SURFACE_INFO_OUTPUT); + + if (ADDR_OK != Addr3ComputeSurfaceInfo(addr_lib_, &in, &out)) { + return (uint32_t)(-1); + } + if (out.surfSize == 0) { + return (uint32_t)(-1); + } + + return in.swizzleMode; +} + +hsa_status_t ImageManagerGfx12::FillImage(const Image& image, const void* pattern, + const hsa_ext_image_region_t& region) { + if (BlitQueueInit().queue_ == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + Image* image_view = const_cast(&image); + + SQ_BUF_RSRC_WORD3* word3_buff = NULL; + SQ_IMG_RSRC_WORD3* word3_image = NULL; + uint32_t dst_sel_w_original = 0; + if (image_view->desc.format.channel_type == + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010) { + // Force GPU to ignore the last two bits (alpha bits). 
+ if (image_view->desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) { + word3_buff = reinterpret_cast(&image_view->srd[3]); + dst_sel_w_original = word3_buff->bits.DST_SEL_W; + word3_buff->bits.DST_SEL_W = SEL_0; + } else { + word3_image = reinterpret_cast(&image_view->srd[3]); + dst_sel_w_original = word3_image->bits.DST_SEL_W; + word3_image->bits.DST_SEL_W = SEL_0; + } + } + + SQ_IMG_RSRC_WORD1* word1 = NULL; + uint32_t num_format_original = 0; + const void* new_pattern = pattern; + float fill_value[4] = {0}; + switch (image_view->desc.format.channel_order) { + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX: + case HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA: { + // We do not have write support for SRGBA image, so convert pattern + // to standard form and treat the image as RGBA image. + const float* pattern_f = reinterpret_cast(pattern); + fill_value[0] = LinearToStandardRGB(pattern_f[0]); + fill_value[1] = LinearToStandardRGB(pattern_f[1]); + fill_value[2] = LinearToStandardRGB(pattern_f[2]); + fill_value[3] = pattern_f[3]; + new_pattern = fill_value; + + ImageProperty image_prop = ImageLut().MapFormat(image.desc.format, image.desc.geometry); + + word1 = reinterpret_cast(&image_view->srd[1]); + num_format_original = word1->bits.FORMAT; + word1->bits.FORMAT = GetCombinedFormat(image_prop.data_format, TYPE_UNORM); + } break; + default: + break; + } + + hsa_status_t status = ImageRuntime::instance()->blit_kernel().FillImage( + blit_queue_, blit_code_catalog_, *image_view, new_pattern, region); + + // Revert back original configuration. 
+ if (word3_buff != NULL) { + word3_buff->bits.DST_SEL_W = dst_sel_w_original; + } + + if (word3_image != NULL) { + word3_image->bits.DST_SEL_W = dst_sel_w_original; + } + + if (word1 != NULL) { + word1->bits.FORMAT = num_format_original; + } + + return status; +} + +} // namespace image +} // namespace rocr diff --git a/src/image/image_manager_gfx12.h b/src/image/image_manager_gfx12.h new file mode 100755 index 000000000..085dee9c9 --- /dev/null +++ b/src/image/image_manager_gfx12.h @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ +#define EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ + +#include "addrlib/inc/addrinterface.h" +#include "image_lut_gfx11.h" +#include "image_manager_kv.h" + +namespace rocr { +namespace image { + +class ImageManagerGfx12 : public ImageManagerKv { + public: + ImageManagerGfx12(); + virtual ~ImageManagerGfx12(); + + /// @brief Calculate the size and alignment of the backing storage of an + /// image. + virtual hsa_status_t CalculateImageSizeAndAlignment( + hsa_agent_t component, const hsa_ext_image_descriptor_t& desc, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, size_t image_data_slice_pitch, + hsa_ext_image_data_info_t& image_info) const; + + /// @brief Fill image structure with device specific image object. + virtual hsa_status_t PopulateImageSrd(Image& image) const; + + /// @brief Fill image structure with device specific image object using the given format. + virtual hsa_status_t PopulateImageSrd(Image& image, const metadata_amd_t* desc) const; + + /// @brief Modify device specific image object according to the specified + /// new format. + virtual hsa_status_t ModifyImageSrd(Image& image, + hsa_ext_image_format_t& new_format) const; + + /// @brief Fill sampler structure with device specific sampler object. 
+ virtual hsa_status_t PopulateSamplerSrd(Sampler& sampler) const; + + /// @brief Fill image backing storage using agent copy. + virtual hsa_status_t FillImage(const Image& image, const void* pattern, + const hsa_ext_image_region_t& region); + protected: + uint32_t GetAddrlibSurfaceInfoNv(hsa_agent_t component, + const hsa_ext_image_descriptor_t& desc, + Image::TileMode tileMode, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + ADDR3_COMPUTE_SURFACE_INFO_OUTPUT& out) const; + + bool IsLocalMemory(const void* address) const; + virtual const ImageLutGfx11& ImageLut() const { return image_lut_gfx11; }; + + private: + ImageLutGfx11 image_lut_gfx11; + DISALLOW_COPY_AND_ASSIGN(ImageManagerGfx12); +}; + +} // namespace image +} // namespace rocr +#endif // EXT_IMAGE_IMAGE_MANAGER_GFX12_H_ diff --git a/src/image/image_manager_nv.cpp b/src/image/image_manager_nv.cpp index 139a3755c..cb897f196 100755 --- a/src/image/image_manager_nv.cpp +++ b/src/image/image_manager_nv.cpp @@ -698,11 +698,6 @@ uint32_t ImageManagerNv::GetAddrlibSurfaceInfoNv( prefSettingsInput.forbiddenBlock.macroThick64KB = 1; prefSettingsInput.forbiddenBlock.micro = 1; prefSettingsInput.forbiddenBlock.var = 1; - } else { - // Debug setting, simplifies buffer alignment until language runtimes have official gfx10 - // support. 
- prefSettingsInput.forbiddenBlock.macroThin64KB = 1; - prefSettingsInput.forbiddenBlock.macroThick64KB = 1; } // but don't ever allow the 256b swizzle modes diff --git a/src/image/image_runtime.cpp b/src/image/image_runtime.cpp index dc4109cd1..3e015be94 100755 --- a/src/image/image_runtime.cpp +++ b/src/image/image_runtime.cpp @@ -55,6 +55,7 @@ #include "image_manager_ai.h" #include "image_manager_nv.h" #include "image_manager_gfx11.h" +#include "image_manager_gfx12.h" #include "device_info.h" namespace rocr { @@ -110,14 +111,22 @@ hsa_status_t ImageRuntime::CreateImageManager(hsa_agent_t agent, void* data) { ImageManager* image_manager; - if (major_ver >= 11) { + switch (major_ver) { + case 12: + image_manager = new ImageManagerGfx12(); + break; + case 11: image_manager = new ImageManagerGfx11(); - } else if (major_ver >= 10) { + break; + case 10: image_manager = new ImageManagerNv(); - } else if (major_ver >= 9) { + break; + case 9: image_manager = new ImageManagerAi(); - } else { + break; + default: image_manager = new ImageManagerKv(); + break; } hsa_error_code = image_manager->Initialize(agent); diff --git a/src/image/resource_gfx12.h b/src/image/resource_gfx12.h new file mode 100644 index 000000000..6b0bd5648 --- /dev/null +++ b/src/image/resource_gfx12.h @@ -0,0 +1,814 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef EXT_IMAGE_RESOURCE_GFX12_H_ +#define EXT_IMAGE_RESOURCE_GFX12_H_ + +#if defined(LITTLEENDIAN_CPU) +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +namespace rocr { +namespace image { + +/**********************************************************/ +/**********************************************************/ +#define SQ_BUF_RSC_WRD0_REG_SZ 32 +#define SQ_BUF_RSC_WRD0_BASE_ADDRESS_SZ 32 + +struct sq_buf_rsrc_word0_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_BUF_RSC_WRD0_BASE_ADDRESS_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_BUF_RSC_WRD0_BASE_ADDRESS_SZ; +#endif +}; + +union SQ_BUF_RSRC_WORD0 { + sq_buf_rsrc_word0_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD0_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; + +/***********/ + +/* Note: These registers are also defined/used in registers.h + * in SQ_BUF_RSRC_WORD*_GFX12 + */ +#define SQ_BUF_RSC_WRD1_REG_SZ 32 +#define SQ_BUF_RSC_WRD1_BASE_ADDRESS_HI_SZ 16 +#define SQ_BUF_RSC_WRD1_STRIDE_SZ 14 +#define SQ_BUF_RSC_WRD1_SWIZZLE_ENABLE_SZ 2 +struct sq_buf_rsrc_word1_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : SQ_BUF_RSC_WRD1_BASE_ADDRESS_HI_SZ; + unsigned int STRIDE : SQ_BUF_RSC_WRD1_STRIDE_SZ; + unsigned int SWIZZLE_ENABLE : SQ_BUF_RSC_WRD1_SWIZZLE_ENABLE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : SQ_BUF_RSC_WRD1_SWIZZLE_ENABLE_SZ; + unsigned int STRIDE : SQ_BUF_RSC_WRD1_STRIDE_SZ; + unsigned int BASE_ADDRESS_HI : SQ_BUF_RSC_WRD1_BASE_ADDRESS_HI_SZ; +#endif +}; + +union SQ_BUF_RSRC_WORD1 { + sq_buf_rsrc_word1_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD1_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_BUF_RSC_WRD2_REG_SZ 32 +#define SQ_BUF_RSC_WRD2_NUM_RECORDS_SZ 32 +struct sq_buf_rsrc_word2_t { 
+#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_RECORDS : SQ_BUF_RSC_WRD2_NUM_RECORDS_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int NUM_RECORDS : SQ_BUF_RSC_WRD2_NUM_RECORDS_SZ; +#endif +}; +union SQ_BUF_RSRC_WORD2 { + sq_buf_rsrc_word2_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD2_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_BUF_RSC_WRD3_REG_SZ 32 +#define SQ_BUF_RSC_WRD3_DST_SEL_X_SZ 3 +#define SQ_BUF_RSC_WRD3_DST_SEL_Y_SZ 3 +#define SQ_BUF_RSC_WRD3_DST_SEL_Z_SZ 3 +#define SQ_BUF_RSC_WRD3_DST_SEL_W_SZ 3 +#define SQ_BUF_RSC_WRD3_FORMAT_SZ 6 +#define SQ_BUF_RSC_WRD3_INDEX_STRIDE_SZ 2 +#define SQ_BUF_RSC_WRD3_ADD_TID_ENABLE_SZ 1 +#define SQ_BUF_RSC_WRD3_WRITE_COMPRESS_ENABLE_SZ 1 +#define SQ_BUF_RSC_WRD3_COMPRESSION_EN_SZ 1 +#define SQ_BUF_RSC_WRD3_COMPRESSION_ACCESS_MODE_SZ 2 +#define SQ_BUF_RSC_WORD3_OOB_SELECT_SZ 2 +#define SQ_BUF_RSC_WRD3_TYPE_SZ 2 +struct sq_buf_rsrc_word3_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : SQ_BUF_RSC_WRD3_DST_SEL_X_SZ; + unsigned int DST_SEL_Y : SQ_BUF_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_Z : SQ_BUF_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_W : SQ_BUF_RSC_WRD3_DST_SEL_W_SZ; + unsigned int FORMAT : SQ_BUF_RSC_WRD3_FORMAT_SZ; + unsigned int : 3; + unsigned int INDEX_STRIDE : SQ_BUF_RSC_WRD3_INDEX_STRIDE_SZ; + unsigned int ADD_TID_ENABLE : SQ_BUF_RSC_WRD3_ADD_TID_ENABLE_SZ; + unsigned int WRITE_COMPRESS_ENABLE : SQ_BUF_RSC_WRD3_WRITE_COMPRESS_ENABLE_SZ; + unsigned int COMPRESSION_EN : SQ_BUF_RSC_WRD3_COMPRESSION_EN_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_BUF_RSC_WRD3_COMPRESSION_ACCESS_MODE_SZ; + unsigned int OOB_SELECT : SQ_BUF_RSC_WORD3_OOB_SELECT_SZ; + unsigned int TYPE : SQ_BUF_RSC_WRD3_TYPE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : SQ_BUF_RSC_WRD3_TYPE_SZ; + unsigned int OOB_SELECT : SQ_BUF_RSC_WORD3_OOB_SELECT_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_BUF_RSC_WRD3_COMPRESSION_ACCESS_MODE_SZ; + unsigned int 
COMPRESSION_EN : SQ_BUF_RSC_WRD3_COMPRESSION_EN_SZ; + unsigned int WRITE_COMPRESS_ENABLE : SQ_BUF_RSC_WRD3_WRITE_COMPRESS_ENABLE_SZ; + unsigned int ADD_TID_ENABLE : SQ_BUF_RSC_WRD3_ADD_TID_ENABLE_SZ; + unsigned int INDEX_STRIDE : SQ_BUF_RSC_WRD3_INDEX_STRIDE_SZ; + unsigned int : 3; + unsigned int FORMAT : SQ_BUF_RSC_WRD3_FORMAT_SZ; + unsigned int DST_SEL_W : SQ_BUF_RSC_WRD3_DST_SEL_W_SZ; + unsigned int DST_SEL_Z : SQ_BUF_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_Y : SQ_BUF_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_X : SQ_BUF_RSC_WRD3_DST_SEL_X_SZ; +#endif +}; +union SQ_BUF_RSRC_WORD3 { + sq_buf_rsrc_word3_t bitfields, bits, f; + uint32_t val : SQ_BUF_RSC_WRD3_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +/**********************************************************/ +/**********************************************************/ +#define SQ_IMG_RSC_WRD0_REG_SZ 32 +#define SQ_IMG_RSC_WRD0_BASE_ADDRESS_SZ 32 +struct sq_img_rsrc_word0_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_IMG_RSC_WRD0_BASE_ADDRESS_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : SQ_IMG_RSC_WRD0_BASE_ADDRESS_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD0 { + sq_img_rsrc_word0_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD0_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD1_REG_SZ 32 +#define SQ_IMG_RSC_WRD1_BASE_ADDRESS_HI_SZ 8 +#define SQ_IMG_RSC_WRD1_MAX_MIP_SZ 5 +#define SQ_IMG_RSC_WRD1_FORMAT_SZ 8 +#define SQ_IMG_RSC_WRD1_BASE_LEVEL_SZ 5 +#define SQ_IMG_RSC_WRD1_WIDTH_LO 2 + +struct sq_img_rsrc_word1_t{ +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : SQ_IMG_RSC_WRD1_BASE_ADDRESS_HI_SZ; + unsigned int : 4; + unsigned int MAX_MIP : SQ_IMG_RSC_WRD1_MAX_MIP_SZ; + unsigned int FORMAT : SQ_IMG_RSC_WRD1_FORMAT_SZ; + unsigned int BASE_LEVEL : SQ_IMG_RSC_WRD1_BASE_LEVEL_SZ; + unsigned int WIDTH : SQ_IMG_RSC_WRD1_WIDTH_LO; +#elif defined(BIGENDIAN_CPU) 
+ unsigned int WIDTH : SQ_IMG_RSC_WRD1_WIDTH_LO; + unsigned int BASE_LEVEL : SQ_IMG_RSC_WRD1_BASE_LEVEL_SZ; + unsigned int FORMAT : SQ_IMG_RSC_WRD1_FORMAT_SZ; + unsigned int MAX_MIP : SQ_IMG_RSC_WRD1_MAX_MIP_SZ; + unsigned int : 4; + unsigned int BASE_ADDRESS_HI : SQ_IMG_RSC_WRD1_BASE_ADDRESS_HI_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD1 { + sq_img_rsrc_word1_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD1_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD2_REG_SZ 32 +#define SQ_IMG_RSC_WRD2_WIDTH_HI_SZ 14 +#define SQ_IMG_RSC_WRD2_HEIGHT_SZ 16 +struct sq_img_rsrc_word2_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int WIDTH_HI : SQ_IMG_RSC_WRD2_WIDTH_HI_SZ; + unsigned int HEIGHT : SQ_IMG_RSC_WRD2_HEIGHT_SZ; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int HEIGHT : SQ_IMG_RSC_WRD2_HEIGHT_SZ; + unsigned int WIDTH_HI : SQ_IMG_RSC_WRD2_WIDTH_HI_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD2 { + sq_img_rsrc_word2_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD2_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD3_REG_SZ 32 +#define SQ_IMG_RSC_WRD3_DST_SEL_X_SZ 3 +#define SQ_IMG_RSC_WRD3_DST_SEL_Y_SZ 3 +#define SQ_IMG_RSC_WRD3_DST_SEL_Z_SZ 3 +#define SQ_IMG_RSC_WRD3_DST_SEL_W_SZ 3 +#define SQ_IMG_RSC_WRD3_NO_EDGE_CLAMP_SZ 1 +#define SQ_IMG_RSC_WRD3_LAST_LEVEL_SZ 5 +#define SQ_IMG_RSC_WRD3_SW_MODE_SZ 5 +#define SQ_IMG_RSC_WRD3_BC_SWIZZLE_SZ 3 +#define SQ_IMG_RSC_WRD3_TYPE_SZ 4 +struct sq_img_rsrc_word3_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : SQ_IMG_RSC_WRD3_DST_SEL_X_SZ; + unsigned int DST_SEL_Y : SQ_IMG_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_Z : SQ_IMG_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_W : SQ_IMG_RSC_WRD3_DST_SEL_W_SZ; + unsigned int NO_EDGE_CLAMP : SQ_IMG_RSC_WRD3_NO_EDGE_CLAMP_SZ; + unsigned int : 2; + unsigned int LAST_LEVEL : SQ_IMG_RSC_WRD3_LAST_LEVEL_SZ; + unsigned int SW_MODE 
: SQ_IMG_RSC_WRD3_SW_MODE_SZ; + unsigned int BC_SWIZZLE : SQ_IMG_RSC_WRD3_BC_SWIZZLE_SZ; + unsigned int TYPE : SQ_IMG_RSC_WRD3_TYPE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : SQ_IMG_RSC_WRD3_TYPE_SZ; + unsigned int BC_SWIZZLE : SQ_IMG_RSC_WRD3_BC_SWIZZLE_SZ; + unsigned int SW_MODE : SQ_IMG_RSC_WRD3_SW_MODE_SZ; + unsigned int LAST_LEVEL : SQ_IMG_RSC_WRD3_LAST_LEVEL_SZ; + unsigned int : 2; + unsigned int NO_EDGE_CLAMP : SQ_IMG_RSC_WRD3_NO_EDGE_CLAMP_SZ; + unsigned int DST_SEL_W : SQ_IMG_RSC_WRD3_DST_SEL_W_SZ; + unsigned int DST_SEL_Z : SQ_IMG_RSC_WRD3_DST_SEL_Z_SZ; + unsigned int DST_SEL_Y : SQ_IMG_RSC_WRD3_DST_SEL_Y_SZ; + unsigned int DST_SEL_X : SQ_IMG_RSC_WRD3_DST_SEL_X_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD3 { + sq_img_rsrc_word3_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD3_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD4_REG_SZ 32 +#define SQ_IMG_RSC_WRD4_DEPTH_SZ 14 +#define SQ_IMG_RSC_WRD4_PITCH_MSB_SZ 2 +#define SQ_IMG_RSC_WRD4_BASE_ARR_SZ 13 +#define SQ_IMG_RSC_WRD4_BASE_ARRAY_MSB_SZ 1 + +struct sq_img_rsrc_word4_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; + unsigned int PITCH_MSB : SQ_IMG_RSC_WRD4_PITCH_MSB_SZ; + unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; + unsigned int BASE_ARRAY_MSB : SQ_IMG_RSC_WRD4_BASE_ARRAY_MSB_SZ; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int BASE_ARRAY_MSB : SQ_IMG_RSC_WRD4_BASE_ARRAY_MSB_SZ; + unsigned int BASE_ARRAY : SQ_IMG_RSC_WRD4_BASE_ARR_SZ; + unsigned int PITCH_MSB : SQ_IMG_RSC_WRD4_PITCH_MSB_SZ; + unsigned int DEPTH : SQ_IMG_RSC_WRD4_DEPTH_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD4 { + sq_img_rsrc_word4_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD4_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD5_REG_SZ 32 +#define SQ_IMG_RSC_WRD5_UAV3D_SZ 1 +#define SQ_IMG_RSC_WRD5_DEPTH_SCALE_SZ 5 
+#define SQ_IMG_RSC_WRD5_HEIGHT_SCALE_SZ 5 +#define SQ_IMG_RSC_WRD5_WIDTH_SCALE_SZ 5 // Combined two consecutive separate fields width[0:2] and width[3:4]. +#define SQ_IMG_RSC_WRD5_PERF_MOD_SZ 3 +#define SQ_IMG_RSC_WRD5_CORNER_SAMPLES_SZ 1 +#define SQ_IMG_RSC_WRD5_LINKED_RESOURCE_SZ 1 +#define SQ_IMG_RSC_WRD5_LOD_HWD_CNT_EN_SZ 1 +#define SQ_IMG_RSC_WRD5_MIN_LOD_LO_SZ 6 // lowest 6 bits of MIN_LOD (13 bit total) + +struct sq_img_rsrc_word5_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int : 4; + unsigned int UAV3D : SQ_IMG_RSC_WRD5_UAV3D_SZ; + unsigned int DEPTH_SCALE : SQ_IMG_RSC_WRD5_DEPTH_SCALE_SZ; + unsigned int HEIGHT_SCALE : SQ_IMG_RSC_WRD5_HEIGHT_SCALE_SZ; + unsigned int WIDTH_SCALE : SQ_IMG_RSC_WRD5_WIDTH_SCALE_SZ; + unsigned int PERF_MOD : SQ_IMG_RSC_WRD5_PERF_MOD_SZ; + unsigned int CORNER_SAMPLES : SQ_IMG_RSC_WRD5_CORNER_SAMPLES_SZ; + unsigned int LINKED_RESOURCE : SQ_IMG_RSC_WRD5_LINKED_RESOURCE_SZ; + unsigned int LOD_HWD_CNT_EN : SQ_IMG_RSC_WRD5_LOD_HWD_CNT_EN_SZ; + unsigned int MIN_LOD_LO : SQ_IMG_RSC_WRD5_MIN_LOD_LO_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int MIN_LOD_LO : SQ_IMG_RSC_WRD5_MIN_LOD_LO_SZ; + unsigned int LOD_HWD_CNT_EN : SQ_IMG_RSC_WRD5_LOD_HWD_CNT_EN_SZ; + unsigned int LINKED_RESOURCE : SQ_IMG_RSC_WRD5_LINKED_RESOURCE_SZ; + unsigned int CORNER_SAMPLES : SQ_IMG_RSC_WRD5_CORNER_SAMPLES_SZ; + unsigned int PERF_MOD : SQ_IMG_RSC_WRD5_PERF_MOD_SZ; + unsigned int WIDTH_SCALE : SQ_IMG_RSC_WRD5_WIDTH_SCALE_SZ; + unsigned int HEIGHT_SCALE : SQ_IMG_RSC_WRD5_HEIGHT_SCALE_SZ; + unsigned int DEPTH_SCALE : SQ_IMG_RSC_WRD5_DEPTH_SCALE_SZ; + unsigned int UAV3D : SQ_IMG_RSC_WRD5_UAV3D_SZ; + unsigned int : 4; +#endif +}; + +union SQ_IMG_RSRC_WORD5 { + sq_img_rsrc_word5_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD5_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD6_REG_SZ 32 + +#define SQ_IMG_RSC_WRD6_MIN_LOD_HI_SZ 7 +#define SQ_IMG_RSC_WRD5_COUNTER_BANK_ID_SZ 8 // 3 fields combined into 
bank_id +#define SQ_IMG_RSC_WRD6_MAX_UNCOMP_BLK_SZ_SZ 1 +#define SQ_IMG_RSC_WRD6_MAX_COMP_BLK_SZ_SZ 2 +#define SQ_IMG_RSC_WRD6_WRITE_COMPRESS_EN_SZ 1 +#define SQ_IMG_RSC_WRD6_COMPRESSION_ENABLE_SZ 1 +#define SQ_IMG_RSC_WRD6_COMPRESSION_ACCESS_MODE_SZ 2 +#define SQ_IMG_RSC_WRD6_SPECULATIVE_READ_SZ 2 + +struct sq_img_rsrc_word6_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int MIN_LOD_HI : SQ_IMG_RSC_WRD6_MIN_LOD_HI_SZ; + unsigned int COUNTER_BANK_ID : SQ_IMG_RSC_WRD5_COUNTER_BANK_ID_SZ; + unsigned int MAX_UNCOMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_UNCOMP_BLK_SZ_SZ; + unsigned int : 1; + unsigned int MAX_COMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_COMP_BLK_SZ_SZ; + unsigned int : 1; + unsigned int WRITE_COMPRESS_ENABLE : SQ_IMG_RSC_WRD6_WRITE_COMPRESS_EN_SZ; + unsigned int COMPRESSION_ENABLE : SQ_IMG_RSC_WRD6_COMPRESSION_ENABLE_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_IMG_RSC_WRD6_COMPRESSION_ACCESS_MODE_SZ; + unsigned int SPECULATIVE_READ : SQ_IMG_RSC_WRD6_SPECULATIVE_READ_SZ; + unsigned int : 6; +#elif defined(BIGENDIAN_CPU) + unsigned int : 6; + unsigned int SPECULATIVE_READ : SQ_IMG_RSC_WRD6_SPECULATIVE_READ_SZ; + unsigned int COMPRESSION_ACCESS_MODE : SQ_IMG_RSC_WRD6_COMPRESSION_ACCESS_MODE_SZ; + unsigned int COMPRESSION_ENABLE : SQ_IMG_RSC_WRD6_COMPRESSION_ENABLE_SZ; + unsigned int WRITE_COMPRESS_ENABLE : SQ_IMG_RSC_WRD6_WRITE_COMPRESS_EN_SZ; + unsigned int : 1; + unsigned int MAX_COMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_COMP_BLK_SZ_SZ; + unsigned int : 1; + unsigned int MAX_UNCOMP_BLK_SZ : SQ_IMG_RSC_WRD6_MAX_UNCOMP_BLK_SZ_SZ; + unsigned int COUNTER_BANK_ID : SQ_IMG_RSC_WRD5_COUNTER_BANK_ID_SZ; + unsigned int MIN_LOD_HI : SQ_IMG_RSC_WRD6_MIN_LOD_HI_SZ; +#endif +}; +union SQ_IMG_RSRC_WORD6 { + sq_img_rsrc_word6_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD6_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_RSC_WRD7_REG_SZ 32 +struct sq_img_rsrc_word7_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int : 32; +#elif 
defined(BIGENDIAN_CPU) + unsigned int : 32; +#endif +}; +union SQ_IMG_RSRC_WORD7 { + sq_img_rsrc_word7_t bitfields, bits, f; + uint32_t val : SQ_IMG_RSC_WRD7_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ +/**********************************************************/ +/**********************************************************/ + + + + +#define SQ_IMG_SAMP_WORD0_REG_SZ 32 +#define SQ_IMG_SAMP_WORD0_CLAMP_X_SZ 3 +#define SQ_IMG_SAMP_WORD0_CLAMP_Y_SZ 3 +#define SQ_IMG_SAMP_WORD0_CLAMP_Z_SZ 3 +#define SQ_IMG_SAMP_WORD0_MAX_ANISO_RATIO_SZ 3 +#define SQ_IMG_SAMP_WORD0_DEPTH_COMPARE_FUNC_SZ 3 +#define SQ_IMG_SAMP_WORD0_FORCE_UNNORMALIZED_SZ 1 +#define SQ_IMG_SAMP_WORD0_ANISO_THRESHOLD_SZ 3 +#define SQ_IMG_SAMP_WORD0_MC_COORD_TRUNC_SZ 1 +#define SQ_IMG_SAMP_WORD0_FORCE_DEGAMMA_SZ 1 +#define SQ_IMG_SAMP_WORD0_ANISO_BIAS_SZ 6 +#define SQ_IMG_SAMP_WORD0_TRUNC_COORD_SZ 1 +#define SQ_IMG_SAMP_WORD0_DISABLE_CUBE_WRAP_SZ 1 +#define SQ_IMG_SAMP_WORD0_FILTER_MODE_SZ 2 +#define SQ_IMG_SAMP_WORD0_SKIP_DEGAMMA_SZ 1 +struct sq_img_samp_word0_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int CLAMP_X : SQ_IMG_SAMP_WORD0_CLAMP_X_SZ; + unsigned int CLAMP_Y : SQ_IMG_SAMP_WORD0_CLAMP_Y_SZ; + unsigned int CLAMP_Z : SQ_IMG_SAMP_WORD0_CLAMP_Z_SZ; + unsigned int MAX_ANISO_RATIO : SQ_IMG_SAMP_WORD0_MAX_ANISO_RATIO_SZ; + unsigned int DEPTH_COMPARE_FUNC : SQ_IMG_SAMP_WORD0_DEPTH_COMPARE_FUNC_SZ; + unsigned int FORCE_UNNORMALIZED : SQ_IMG_SAMP_WORD0_FORCE_UNNORMALIZED_SZ; + unsigned int ANISO_THRESHOLD : SQ_IMG_SAMP_WORD0_ANISO_THRESHOLD_SZ; + unsigned int MC_COORD_TRUNC : SQ_IMG_SAMP_WORD0_MC_COORD_TRUNC_SZ; + unsigned int FORCE_DEGAMMA : SQ_IMG_SAMP_WORD0_FORCE_DEGAMMA_SZ; + unsigned int ANISO_BIAS : SQ_IMG_SAMP_WORD0_ANISO_BIAS_SZ; + unsigned int TRUNC_COORD : SQ_IMG_SAMP_WORD0_TRUNC_COORD_SZ; + unsigned int DISABLE_CUBE_WRAP : SQ_IMG_SAMP_WORD0_DISABLE_CUBE_WRAP_SZ; + unsigned int FILTER_MODE : SQ_IMG_SAMP_WORD0_FILTER_MODE_SZ; + unsigned int SKIP_DEGAMMA : 
SQ_IMG_SAMP_WORD0_SKIP_DEGAMMA_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int SKIP_DEGAMMA : SQ_IMG_SAMP_WORD0_SKIP_DEGAMMA_SZ; + unsigned int FILTER_MODE : SQ_IMG_SAMP_WORD0_FILTER_MODE_SZ; + unsigned int DISABLE_CUBE_WRAP : SQ_IMG_SAMP_WORD0_DISABLE_CUBE_WRAP_SZ; + unsigned int TRUNC_COORD : SQ_IMG_SAMP_WORD0_TRUNC_COORD_SZ; + unsigned int ANISO_BIAS : SQ_IMG_SAMP_WORD0_ANISO_BIAS_SZ; + unsigned int FORCE_DEGAMMA : SQ_IMG_SAMP_WORD0_FORCE_DEGAMMA_SZ; + unsigned int MC_COORD_TRUNC : SQ_IMG_SAMP_WORD0_MC_COORD_TRUNC_SZ; + unsigned int ANISO_THRESHOLD : SQ_IMG_SAMP_WORD0_ANISO_THRESHOLD_SZ; + unsigned int FORCE_UNNORMALIZED : SQ_IMG_SAMP_WORD0_FORCE_UNNORMALIZED_SZ; + unsigned int DEPTH_COMPARE_FUNC : SQ_IMG_SAMP_WORD0_DEPTH_COMPARE_FUNC_SZ; + unsigned int MAX_ANISO_RATIO : SQ_IMG_SAMP_WORD0_MAX_ANISO_RATIO_SZ; + unsigned int CLAMP_Z : SQ_IMG_SAMP_WORD0_CLAMP_Z_SZ; + unsigned int CLAMP_Y : SQ_IMG_SAMP_WORD0_CLAMP_Y_SZ; + unsigned int CLAMP_X : SQ_IMG_SAMP_WORD0_CLAMP_X_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD0 { + sq_img_samp_word0_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD0_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +#define SQ_IMG_SAMP_WORD1_REG_SZ 32 +#define SQ_IMG_SAMP_WORD1_MIN_LOD_SZ 13 +#define SQ_IMG_SAMP_WORD1_MAX_LOD_SZ 13 +#define SQ_IMG_SAMP_WORD1_PERF_Z_SZ 4 +struct sq_img_samp_word1_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int MIN_LOD : SQ_IMG_SAMP_WORD1_MIN_LOD_SZ; + unsigned int MAX_LOD : SQ_IMG_SAMP_WORD1_MAX_LOD_SZ; + unsigned int : 2; + unsigned int PERF_Z : SQ_IMG_SAMP_WORD1_PERF_Z_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int PERF_Z : SQ_IMG_SAMP_WORD1_PERF_Z_SZ; + unsigned int : 2; + unsigned int MAX_LOD : SQ_IMG_SAMP_WORD1_MAX_LOD_SZ; + unsigned int MIN_LOD : SQ_IMG_SAMP_WORD1_MIN_LOD_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD1 { + sq_img_samp_word1_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD1_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; 
+/***********/ + +#define SQ_IMG_SAMP_WORD2_REG_SZ 32 +#define SQ_IMG_SAMP_WORD2_LOD_BIAS_SZ 14 +#define SQ_IMG_SAMP_WORD2_LOD_BIAS_SEC_SZ 6 +#define SQ_IMG_SAMP_WORD2_XY_MAG_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_XY_MIN_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_Z_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_MIP_FILTER_SZ 2 +#define SQ_IMG_SAMP_WORD2_ANISO_OVERRIDE_SZ 1 +#define SQ_IMG_SAMP_WORD2_PERF_MIP_LO_SZ 2 +struct sq_img_samp_word2_t { +#if defined(LITTLEENDIAN_CPU) + unsigned int LOD_BIAS : SQ_IMG_SAMP_WORD2_LOD_BIAS_SZ; + unsigned int LOD_BIAS_SEC : SQ_IMG_SAMP_WORD2_LOD_BIAS_SEC_SZ; + unsigned int XY_MAG_FILTER : SQ_IMG_SAMP_WORD2_XY_MAG_FILTER_SZ; + unsigned int XY_MIN_FILTER : SQ_IMG_SAMP_WORD2_XY_MIN_FILTER_SZ; + unsigned int Z_FILTER : SQ_IMG_SAMP_WORD2_Z_FILTER_SZ; + unsigned int MIP_FILTER : SQ_IMG_SAMP_WORD2_MIP_FILTER_SZ; + unsigned int : 1; + unsigned int ANISO_OVERRIDE : SQ_IMG_SAMP_WORD2_ANISO_OVERRIDE_SZ; + unsigned int PERF_MIP_LO : SQ_IMG_SAMP_WORD2_PERF_MIP_LO_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int PERF_MIP_LO : SQ_IMG_SAMP_WORD2_PERF_MIP_LO_SZ; + unsigned int ANISO_OVERRIDE : SQ_IMG_SAMP_WORD2_ANISO_OVERRIDE_SZ; + unsigned int : 1; + unsigned int MIP_FILTER : SQ_IMG_SAMP_WORD2_MIP_FILTER_SZ; + unsigned int Z_FILTER : SQ_IMG_SAMP_WORD2_Z_FILTER_SZ; + unsigned int XY_MIN_FILTER : SQ_IMG_SAMP_WORD2_XY_MIN_FILTER_SZ; + unsigned int XY_MAG_FILTER : SQ_IMG_SAMP_WORD2_XY_MAG_FILTER_SZ; + unsigned int LOD_BIAS_SEC : SQ_IMG_SAMP_WORD2_LOD_BIAS_SEC_SZ; + unsigned int LOD_BIAS : SQ_IMG_SAMP_WORD2_LOD_BIAS_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD2 { + sq_img_samp_word2_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD2_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + + +#define SQ_IMG_SAMP_WORD3_REG_SZ 32 +#define SQ_IMG_SAMP_WORD3_PERF_MIP_HI_SZ 2 +#define SQ_IMG_SAMP_WORD3_BORDER_COLOR_PTR_SZ 12 +#define SQ_IMG_SAMP_WORD3_BORD_COLOR_TYPE_SZ 2 + +struct sq_img_samp_word3_t { +#if defined(LITTLEENDIAN_CPU) + 
unsigned int PERF_MIP_HI : SQ_IMG_SAMP_WORD3_PERF_MIP_HI_SZ; + unsigned int : 16; + unsigned int BORDER_COLOR_PTR : SQ_IMG_SAMP_WORD3_BORDER_COLOR_PTR_SZ; + unsigned int BORDER_COLOR_TYPE : SQ_IMG_SAMP_WORD3_BORD_COLOR_TYPE_SZ; +#elif defined(BIGENDIAN_CPU) + unsigned int BORDER_COLOR_TYPE : SQ_IMG_SAMP_WORD3_BORD_COLOR_TYPE_SZ; + unsigned int BORDER_COLOR_PTR : SQ_IMG_SAMP_WORD3_BORDER_COLOR_PTR_SZ; + unsigned int : 16; + unsigned int PERF_MIP_HI : SQ_IMG_SAMP_WORD3_PERF_MIP_HI_SZ; +#endif +}; + +union SQ_IMG_SAMP_WORD3 { + sq_img_samp_word3_t bitfields, bits, f; + uint32_t val : SQ_IMG_SAMP_WORD3_REG_SZ; + uint32_t u32All; + int32_t i32All; + float f32All; +}; +/***********/ + +/**************************************************************/ +/**************************************************************/ +/**************************************************************/ + +typedef enum FMT { +FMT_INVALID = 0x00000000, +FMT_8 = 0x00000001, +FMT_16 = 0x00000002, +FMT_8_8 = 0x00000003, +FMT_32 = 0x00000004, +FMT_16_16 = 0x00000005, +FMT_10_11_11 = 0x00000006, +FMT_11_11_10 = 0x00000007, +FMT_10_10_10_2 = 0x00000008, +FMT_2_10_10_10 = 0x00000009, +FMT_8_8_8_8 = 0x0000000a, +FMT_32_32 = 0x0000000b, +FMT_16_16_16_16 = 0x0000000c, +FMT_32_32_32 = 0x0000000d, +FMT_32_32_32_32 = 0x0000000e, +FMT_RESERVED_78 = 0x0000000f, +FMT_5_6_5 = 0x00000010, +FMT_1_5_5_5 = 0x00000011, +FMT_5_5_5_1 = 0x00000012, +FMT_4_4_4_4 = 0x00000013, +FMT_8_24 = 0x00000014, +FMT_24_8 = 0x00000015, +FMT_X24_8_32 = 0x00000016, +FMT_RESERVED_155 = 0x00000017, +} FMT; + +typedef enum type { +TYPE_UNORM = 0x00000000, +TYPE_SNORM = 0x00000001, +TYPE_USCALED = 0x00000002, +TYPE_SSCALED = 0x00000003, +TYPE_UINT = 0x00000004, +TYPE_SINT = 0x00000005, +TYPE_SRGB = 0x00000006, +TYPE_FLOAT = 0x00000007, +TYPE_RESERVED_8 = 0x00000008, +TYPE_RESERVED_9 = 0x00000009, +TYPE_UNORM_UINT = 0x0000000a, +TYPE_REVERSED_UNORM = 0x0000000b, +TYPE_FLOAT_CLAMP = 0x0000000c, +} type; + +enum FORMAT { +CFMT_INVALID = 0, 
+CFMT_8_UNORM = 1, +CFMT_8_SNORM = 2, +CFMT_8_USCALED = 3, +CFMT_8_SSCALED = 4, +CFMT_8_UINT = 5, +CFMT_8_SINT = 6, +CFMT_16_UNORM = 7, +CFMT_16_SNORM = 8, +CFMT_16_USCALED = 9, +CFMT_16_SSCALED = 10, +CFMT_16_UINT = 11, +CFMT_16_SINT = 12, +CFMT_16_FLOAT = 13, +CFMT_8_8_UNORM = 14, +CFMT_8_8_SNORM = 15, +CFMT_8_8_USCALED = 16, +CFMT_8_8_SSCALED = 17, +CFMT_8_8_UINT = 18, +CFMT_8_8_SINT = 19, +CFMT_32_UINT = 20, +CFMT_32_SINT = 21, +CFMT_32_FLOAT = 22, +CFMT_16_16_UNORM = 23, +CFMT_16_16_SNORM = 24, +CFMT_16_16_USCALED = 25, +CFMT_16_16_SSCALED = 26, +CFMT_16_16_UINT = 27, +CFMT_16_16_SINT = 28, +CFMT_16_16_FLOAT = 29, +CFMT_10_11_11_FLOAT = 30, +CFMT_11_11_10_FLOAT = 31, +CFMT_10_10_10_2_UNORM = 32, +CFMT_10_10_10_2_SNORM = 33, +CFMT_10_10_10_2_UINT = 34, +CFMT_10_10_10_2_SINT = 35, +CFMT_2_10_10_10_UNORM = 36, +CFMT_2_10_10_10_SNORM = 37, +CFMT_2_10_10_10_USCALED = 38, +CFMT_2_10_10_10_SSCALED = 39, +CFMT_2_10_10_10_UINT = 40, +CFMT_2_10_10_10_SINT = 41, +CFMT_8_8_8_8_UNORM = 42, +CFMT_8_8_8_8_SNORM = 43, +CFMT_8_8_8_8_USCALED = 44, +CFMT_8_8_8_8_SSCALED = 45, +CFMT_8_8_8_8_UINT = 46, +CFMT_8_8_8_8_SINT = 47, +CFMT_32_32_UINT = 48, +CFMT_32_32_SINT = 49, +CFMT_32_32_FLOAT = 50, +CFMT_16_16_16_16_UNORM = 51, +CFMT_16_16_16_16_SNORM = 52, +CFMT_16_16_16_16_USCALED = 53, +CFMT_16_16_16_16_SSCALED = 54, +CFMT_16_16_16_16_UINT = 55, +CFMT_16_16_16_16_SINT = 56, +CFMT_16_16_16_16_FLOAT = 57, +CFMT_32_32_32_UINT = 58, +CFMT_32_32_32_SINT = 59, +CFMT_32_32_32_FLOAT = 60, +CFMT_32_32_32_32_UINT = 61, +CFMT_32_32_32_32_SINT = 62, +CFMT_32_32_32_32_FLOAT = 63, +CFMT_8_SRGB = 64, +CFMT_8_8_SRGB = 65, +CFMT_8_8_8_8_SRGB = 66, +CFMT_5_9_9_9_FLOAT = 67, +CFMT_5_6_5_UNORM = 68, +CFMT_1_5_5_5_UNORM = 69, +CFMT_5_5_5_1_UNORM = 70, +CFMT_4_4_4_4_UNORM = 71, +CFMT_4_4_UNORM = 72, +CFMT_1_UNORM = 73, +CFMT_1_REVERSED_UNORM = 74, +CFMT_32_FLOAT_CLAMP = 75, +CFMT_8_24_UNORM = 76, +CFMT_8_24_UINT = 77, +CFMT_24_8_UNORM = 78, +CFMT_24_8_UINT = 79, +CFMT_X24_8_32_UINT = 80, 
+CFMT_X24_8_32_FLOAT = 81, +}; + +typedef enum SEL { + SEL_0 = 0x00000000, + SEL_1 = 0x00000001, + SEL_X = 0x00000004, + SEL_Y = 0x00000005, + SEL_Z = 0x00000006, + SEL_W = 0x00000007, +} SEL; + +typedef enum SQ_RSRC_IMG_TYPE { + SQ_RSRC_IMG_1D = 0x00000008, + SQ_RSRC_IMG_2D = 0x00000009, + SQ_RSRC_IMG_3D = 0x0000000a, + SQ_RSRC_IMG_CUBE_ARRAY = 0x0000000b, + SQ_RSRC_IMG_1D_ARRAY = 0x0000000c, + SQ_RSRC_IMG_2D_ARRAY = 0x0000000d, + SQ_RSRC_IMG_2D_MSAA = 0x0000000e, + SQ_RSRC_IMG_2D_MSAA_ARRAY = 0x0000000f, +} SQ_RSRC_IMG_TYPE; + +typedef enum SQ_TEX_XY_FILTER { + SQ_TEX_XY_FILTER_POINT = 0x00000000, + SQ_TEX_XY_FILTER_BILINEAR = 0x00000001, + SQ_TEX_XY_FILTER_ANISO_POINT = 0x00000002, + SQ_TEX_XY_FILTER_ANISO_BILINEAR = 0x00000003, +} SQ_TEX_XY_FILTER; + +typedef enum SQ_TEX_Z_FILTER { + SQ_TEX_Z_FILTER_NONE = 0x00000000, + SQ_TEX_Z_FILTER_POINT = 0x00000001, + SQ_TEX_Z_FILTER_LINEAR = 0x00000002, +} SQ_TEX_Z_FILTER; + +typedef enum SQ_TEX_MIP_FILTER { + SQ_TEX_MIP_FILTER_NONE = 0x00000000, + SQ_TEX_MIP_FILTER_POINT = 0x00000001, + SQ_TEX_MIP_FILTER_LINEAR = 0x00000002, + SQ_TEX_MIP_FILTER_POINT_ANISO_ADJ__VI = 0x00000003, +} SQ_TEX_MIP_FILTER; + +typedef enum SQ_TEX_CLAMP { + SQ_TEX_WRAP = 0x00000000, + SQ_TEX_MIRROR = 0x00000001, + SQ_TEX_CLAMP_LAST_TEXEL = 0x00000002, + SQ_TEX_MIRROR_ONCE_LAST_TEXEL = 0x00000003, + SQ_TEX_CLAMP_HALF_BORDER = 0x00000004, + SQ_TEX_MIRROR_ONCE_HALF_BORDER = 0x00000005, + SQ_TEX_CLAMP_BORDER = 0x00000006, + SQ_TEX_MIRROR_ONCE_BORDER = 0x00000007, +} SQ_TEX_CLAMP; + +typedef enum SQ_TEX_BORDER_COLOR { + SQ_TEX_BORDER_COLOR_TRANS_BLACK = 0x00000000, + SQ_TEX_BORDER_COLOR_OPAQUE_BLACK = 0x00000001, + SQ_TEX_BORDER_COLOR_OPAQUE_WHITE = 0x00000002, + SQ_TEX_BORDER_COLOR_REGISTER = 0x00000003, +} SQ_TEX_BORDER_COLOR; + +typedef enum TEX_BC_SWIZZLE { +TEX_BC_Swizzle_XYZW = 0x00000000, +TEX_BC_Swizzle_XWYZ = 0x00000001, +TEX_BC_Swizzle_WZYX = 0x00000002, +TEX_BC_Swizzle_WXYZ = 0x00000003, +TEX_BC_Swizzle_ZYXW = 0x00000004, 
+TEX_BC_Swizzle_YXWZ = 0x00000005, +} TEX_BC_SWIZZLE; + +typedef struct metadata_amd_gfx12_s { + uint32_t version; // Must be 1 + uint32_t vendorID; // AMD + SQ_IMG_RSRC_WORD0 word0; + SQ_IMG_RSRC_WORD1 word1; + SQ_IMG_RSRC_WORD2 word2; + SQ_IMG_RSRC_WORD3 word3; + SQ_IMG_RSRC_WORD4 word4; + SQ_IMG_RSRC_WORD5 word5; + SQ_IMG_RSRC_WORD6 word6; + SQ_IMG_RSRC_WORD7 word7; + uint32_t mip_offsets[0]; +} metadata_amd_gfx12_t; + +} // namespace image +} // namespace rocr +#endif // EXT_IMAGE_RESOURCE_GFX12_H_ + diff --git a/src/image/util.h b/src/image/util.h index 8482e41a4..88cdf4ccc 100644 --- a/src/image/util.h +++ b/src/image/util.h @@ -99,7 +99,7 @@ static __forceinline void* _aligned_malloc(size_t size, size_t alignment) { return aligned_alloc(alignment, size); #else void* mem = NULL; - if (NULL != posix_memalign(&mem, alignment, size)) return NULL; + if (0 != posix_memalign(&mem, alignment, size)) return NULL; return mem; #endif } diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index 51aa389a0..74f15d7d7 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -75,7 +75,8 @@ enum { ELFABIVERSION_AMDGPU_HSA_V2 = 0, ELFABIVERSION_AMDGPU_HSA_V3 = 1, ELFABIVERSION_AMDGPU_HSA_V4 = 2, - ELFABIVERSION_AMDGPU_HSA_V5 = 3 + ELFABIVERSION_AMDGPU_HSA_V5 = 3, + ELFABIVERSION_AMDGPU_HSA_V6 = 4, }; // AMDGPU specific e_flags. @@ -87,6 +88,7 @@ enum : unsigned { EF_AMDGPU_MACH_NONE = 0x000, // AMDGCN-based processors. 
+ // clang-format off EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, @@ -127,13 +129,25 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, + EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050, + EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, + EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, + EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053, + EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055, + // clang-format on // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX942, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. @@ -159,8 +173,7 @@ enum : unsigned { // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values. // - // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, - // ELFABIVERSION_AMDGPU_HSA_V5. + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, // XNACK is not supported. EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000, @@ -173,8 +186,7 @@ enum : unsigned { // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values. // - // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, - // ELFABIVERSION_AMDGPU_HSA_V5. + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. 
EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, // SRAMECC is not supported. EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000, @@ -184,6 +196,21 @@ enum : unsigned { EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800, // SRAMECC is on. EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00, + + // Generic target versioning. This is contained in the last byte of EFLAGS. + EF_AMDGPU_GENERIC_VERSION = 0xff000000, + EF_AMDGPU_GENERIC_VERSION_OFFSET = 24, + EF_AMDGPU_GENERIC_VERSION_MIN = 1, + EF_AMDGPU_GENERIC_VERSION_MAX = 0xff, +}; + +// ELF Relocation types for AMDGPU. +enum : unsigned { + R_AMDGPU_ABS32_LO = 1, + R_AMDGPU_ABS32_HI = 2, + R_AMDGPU_ABS64 = 3, + R_AMDGPU_ABS32 = 6, + R_AMDGPU_RELATIVE64 = 13, }; } // end namespace ELF @@ -245,14 +272,14 @@ typedef enum { // ELF Symbol Flag Enumeration Values. #define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST -// AMD GPU Relocation Type Enumeration Values. -#define R_AMDGPU_NONE 0 -#define R_AMDGPU_32_LOW 1 -#define R_AMDGPU_32_HIGH 2 -#define R_AMDGPU_64 3 -#define R_AMDGPU_INIT_SAMPLER 4 -#define R_AMDGPU_INIT_IMAGE 5 -#define R_AMDGPU_RELATIVE64 13 +// Legacy/V1 AMD GPU Relocation Type Enumeration Values. +#define R_AMDGPU_V1_NONE 0 +#define R_AMDGPU_V1_32_LOW 1 +#define R_AMDGPU_V1_32_HIGH 2 +#define R_AMDGPU_V1_64 3 +#define R_AMDGPU_V1_INIT_SAMPLER 4 +#define R_AMDGPU_V1_INIT_IMAGE 5 +#define R_AMDGPU_V1_RELATIVE64 13 // AMD GPU Note Type Enumeration Values. #define NT_AMD_HSA_CODE_OBJECT_VERSION 1 diff --git a/src/inc/hsa.h b/src/inc/hsa.h index 9520bd870..1ad714c44 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -598,10 +598,14 @@ typedef enum { * AqlProfile extension. */ HSA_EXTENSION_AMD_AQLPROFILE = 0x202, + /** + * PC Sampling extension. + */ + HSA_EXTENSION_AMD_PC_SAMPLING = 0x203, /** * Last AMD extension. */ - HSA_AMD_LAST_EXTENSION = 0x202 + HSA_AMD_LAST_EXTENSION = 0x203 } hsa_extension_t; /** @@ -5656,7 +5660,12 @@ typedef enum { * undefined if the symbol is not an indirect function. The type of this * attribute is uint32_t.
*/ - HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16, + /** + * Wavefront size used by the kernel. The value of this attribute is either + * 32 or 64. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19 } hsa_code_symbol_info_t; /** diff --git a/src/inc/hsa_api_trace.h b/src/inc/hsa_api_trace.h index e46c777af..2a0f59df3 100644 --- a/src/inc/hsa_api_trace.h +++ b/src/inc/hsa_api_trace.h @@ -44,39 +44,26 @@ #define HSA_RUNTIME_INC_HSA_API_TRACE_H #include "hsa.h" +#include "hsa_api_trace_version.h" #ifdef AMD_INTERNAL_BUILD #include "hsa_ext_image.h" #include "hsa_ext_amd.h" #include "hsa_ext_finalize.h" #include "hsa_amd_tool.h" +#include "hsa_ven_amd_pc_sampling.h" #else #include "inc/hsa_ext_image.h" #include "inc/hsa_ext_amd.h" #include "inc/hsa_ext_finalize.h" #include "inc/hsa_amd_tool.h" +#include "inc/hsa_ven_amd_pc_sampling.h" #endif #include #include #include -// Major Ids of the Api tables exported by Hsa Core Runtime -#define HSA_API_TABLE_MAJOR_VERSION 0x03 -#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 -#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 - -// Step Ids of the Api tables exported by Hsa Core Runtime -#define HSA_API_TABLE_STEP_VERSION 0x00 -#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x01 -#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 -#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 -#define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h // Min function used to copy Api Tables static inline uint32_t Min(const uint32_t a, const uint32_t b) { @@ -191,6 
+178,19 @@ struct ImageExtTable { decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn; }; +// Table to export HSA PC Sampling Extension Apis +struct PcSamplingExtTable { + ApiTableVersion version; + decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn; + decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn; + decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn; + decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn; + decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn; + decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn; + decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn; +}; + + // Table to export AMD Extension Apis struct AmdExtTable { ApiTableVersion version; @@ -263,6 +263,8 @@ struct AmdExtTable { decltype(hsa_amd_vmem_get_alloc_properties_from_handle)* hsa_amd_vmem_get_alloc_properties_from_handle_fn; decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn; + decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn; + decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn; }; // Table to export HSA Core Runtime Apis @@ -464,6 +466,9 @@ struct HsaApiTable { // Table of function pointers for tools to use ToolsApiTable* tools_; + + // Table of function pointers to AMD PC Sampling Extension + PcSamplingExtTable* pc_sampling_ext_; }; // Structure containing instances of different api tables @@ -474,6 +479,7 @@ struct HsaApiTableContainer { FinalizerExtTable finalizer_ext; ImageExtTable image_ext; ToolsApiTable tools; + PcSamplingExtTable pc_sampling_ext; // Default initialization of a container instance HsaApiTableContainer() { @@ -505,6 +511,11 @@ struct HsaApiTableContainer { tools.version.minor_id = sizeof(ToolsApiTable); tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION; root.tools_ = &tools; + + pc_sampling_ext.version.major_id = 
HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable); + pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + root.pc_sampling_ext_ = &pc_sampling_ext; } }; @@ -562,5 +573,7 @@ static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) { copyElement(&dest->image_ext_->version, &src->image_ext_->version); if ((offsetof(HsaApiTable, tools_) < dest->version.minor_id)) copyElement(&dest->tools_->version, &src->tools_->version); + if ((offsetof(HsaApiTable, pc_sampling_ext_) < dest->version.minor_id)) + copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version); } #endif diff --git a/src/inc/hsa_api_trace_version.h b/src/inc/hsa_api_trace_version.h new file mode 100644 index 000000000..3393a7762 --- /dev/null +++ b/src/inc/hsa_api_trace_version.h @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H + +// CODE IN THIS FILE **MUST** BE C-COMPATIBLE + +// Major Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_MAJOR_VERSION 0x03 +#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION 0x01 + +// Step Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_STEP_VERSION 0x01 +#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x03 +#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 +#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 +#define 
HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION 0x00 + +#endif // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 187bcd958..f9f60edeb 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -47,16 +47,19 @@ #include "hsa.h" #include "hsa_ext_image.h" +#include "hsa_ven_amd_pc_sampling.h" -/* +/** * - 1.0 - initial version * - 1.1 - dmabuf export * - 1.2 - hsa_amd_memory_async_copy_on_engine * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool * - 1.4 - Virtual Memory API + * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES + * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 4 +#define HSA_AMD_INTERFACE_VERSION_MINOR 6 #ifdef __cplusplus extern "C" { @@ -221,6 +224,11 @@ enum { * Exceeded number of VGPRs available on this agent */ HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45, + + /** + * Resource is busy or temporarily unavailable + */ + HSA_STATUS_ERROR_RESOURCE_BUSY = 46, }; /** @@ -1176,7 +1184,11 @@ typedef enum hsa_amd_memory_pool_flag_s { * connection. Atomic memory operations on these memory buffers are not * guaranteed to be visible at system scope. */ - HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1, + HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0), + /** + * Allocates physically contiguous memory + */ + HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1), } hsa_amd_memory_pool_flag_t; @@ -2783,7 +2795,7 @@ hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* d */ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); -/* +/** * @brief Allocate a reserved address range * * Reserve a virtual address range. The size must be a multiple of the system page size. 
@@ -2803,11 +2815,39 @@ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address * range of this size. + * + * Note that this API will be deprecated in a future release and replaced by + * hsa_amd_vmem_address_reserve_align */ hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address, uint64_t flags); -/* +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2 + * @param[in] flags currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + */ +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + +/** * @brief Free a reserved address range * * Free a previously allocated address range. 
The size must match the size of a previously @@ -2841,7 +2881,7 @@ typedef enum { MEMORY_TYPE_PINNED, } hsa_amd_memory_type_t; -/* +/** * @brief Create a virtual memory handle * * Create a virtual memory handle within this pool @@ -2870,7 +2910,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, hsa_amd_memory_type_t type, uint64_t flags, hsa_amd_vmem_alloc_handle_t* memory_handle); -/* +/** * @brief Release a virtual memory handle * * @param[in] memory handle that was previously allocated @@ -2881,7 +2921,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, */ hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle); -/* +/** * @brief Map a virtual memory handle * * Map a virtual memory handle to a reserved address range. The virtual address requested must be @@ -2907,7 +2947,7 @@ hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_hand hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset, hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags); -/* +/** * @brief Unmap a virtual memory handle * * Unmap previously mapped virtual address range @@ -2930,7 +2970,7 @@ typedef struct hsa_amd_memory_access_desc_s { hsa_agent_t agent_handle; } hsa_amd_memory_access_desc_t; -/* +/** * @brief Make a memory mapping accessible * * Make previously mapped virtual address accessible to specific agents. @p size must be equal to @@ -2959,7 +2999,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, const hsa_amd_memory_access_desc_t* desc, size_t desc_cnt); -/* +/** * @brief Get current access permissions for memory mapping * * Get access permissions for memory mapping for specific agent. 
@@ -2980,7 +3020,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, hsa_agent_t agent_handle); -/* +/** * @brief Get an exportable shareable handle * * Get an exportable shareable handle for a memory_handle. This shareabl handle can then be used to @@ -3003,7 +3043,7 @@ hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, hsa_amd_vmem_alloc_handle_t handle, uint64_t flags); -/* +/** * @brief Import a shareable handle * * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed @@ -3023,7 +3063,7 @@ hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, hsa_amd_vmem_alloc_handle_t* handle); -/* +/** * @brief Returns memory handle for mapped memory * * Return a memory handle for previously mapped memory. 
The handle will be the same value of handle @@ -3040,19 +3080,19 @@ hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle, void* addr); -/* -* @brief Returns the current allocation properties of a handle -* -* Returns the allocation properties of an existing handle -* -* @param[in] memory_handle memory handle to be queried -* @param[out] pool memory pool that owns this handle -* @param[out] memory type - -* @retval ::HSA_STATUS_SUCCESS -* -* @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle -*/ +/** + * @brief Returns the current allocation properties of a handle + * + * Returns the allocation properties of an existing handle + * + * @param[in] memory_handle memory handle to be queried + * @param[out] pool memory pool that owns this handle + * @param[out] memory type + + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle + */ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool, hsa_amd_memory_type_t* type); @@ -3084,6 +3124,22 @@ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( */ hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); +typedef enum { + /* + * Returns the agent that owns the underlying HW queue. + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_QUEUE_INFO_AGENT, + /* + * Returns the doorbell ID of the completion signal of the queue + * The type of this attribute is uint64_t. 
+ */ + HSA_AMD_QUEUE_INFO_DOORBELL_ID, +} hsa_queue_info_attribute_t; + +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/src/inc/hsa_ven_amd_aqlprofile.h b/src/inc/hsa_ven_amd_aqlprofile.h index 32ca6b732..0022c0d8b 100644 --- a/src/inc/hsa_ven_amd_aqlprofile.h +++ b/src/inc/hsa_ven_amd_aqlprofile.h @@ -149,61 +149,61 @@ hsa_status_t hsa_ven_amd_aqlprofile_validate_event( // All parameters are generic and if not applicable for a specific // profile configuration then error status will be returned. typedef enum { - /* - * Select the target compute unit (wgp) for profiling. - */ + /** + * Select the target compute unit (wgp) for profiling. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0, - /* - * VMID Mask - */ + /** + * VMID Mask + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4, - /* - * Shader engine mask for selection. - */ + /** + * Shader engine mask for selection. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7, - /* - * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) - */ + /** + * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8, - /* - * Set true for occupancy collection only. 
- */ + /** + * Set true for occupancy collection only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9, - /* - * ATT collection max data size, in MB. Shared among shader engines. - */ + /** + * ATT collection max data size, in MB. Shared among shader engines. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10, - /* - * Mask of which compute units to generate perfcounters. GFX9 only. - */ + /** + * Mask of which compute units to generate perfcounters. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240, - /* - * Select collection period for perfcounters. GFX9 only. - */ + /** + * Select collection period for perfcounters. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241, - /* - * Select perfcounter ID (SQ block) for collection. GFX9 only. - */ + /** + * Select perfcounter ID (SQ block) for collection. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242, } hsa_ven_amd_aqlprofile_parameter_name_t; @@ -365,11 +365,11 @@ hsa_status_t hsa_ven_amd_aqlprofile_error_string( /** * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. -*/ + */ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name); /** * @brief Iterate over all possible event coordinate IDs and their names. -*/ + */ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t); /** @@ -380,7 +380,7 @@ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eve * @param coordinate The coordinate, in the range [0,extent-1]. * @param name Coordinate name as in _iterate_event_ids. * @param userdata Userdata returned from _iterate_event_coord function. 
-*/ + */ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( int position, int id, @@ -397,7 +397,7 @@ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data. * @param[in] callback Callback function to return the coordinates. * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. -*/ + */ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord( hsa_agent_t agent, hsa_ven_amd_aqlprofile_event_t event, diff --git a/src/inc/hsa_ven_amd_pc_sampling.h b/src/inc/hsa_ven_amd_pc_sampling.h new file mode 100644 index 000000000..019f0ea5c --- /dev/null +++ b/src/inc/hsa_ven_amd_pc_sampling.h @@ -0,0 +1,416 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_H +#define HSA_VEN_AMD_PC_SAMPLING_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + + +/** + * @brief HSA AMD Vendor PC Sampling APIs + * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be + * modified extensively in the future + */ + +/** + * @brief PC Sampling sample data for hosttrap sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t reserved0; + uint64_t reserved1; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_hosttrap_v1_t; + +/** + * @brief PC Sampling sample data for stochastic sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t perf_snapshot_data; + uint32_t perf_snapshot_data1; + 
uint32_t perf_snapshot_data2; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_snapshot_v1_t; + +/** + * @brief PC Sampling method kinds + */ +typedef enum { + HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1, + HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1 +} hsa_ven_amd_pcs_method_kind_t; + +/** + * @brief PC Sampling interval unit type + */ +typedef enum { + HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS +} hsa_ven_amd_pcs_units_t; + +/** + * @brief HSA callback function to perform the copy onto a destination buffer + * + * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal + * buffers. Remaining contents of HSA internal buffers will be included in next + * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling + * hsa_ven_amd_pcs_flush. + * + * @param[in] hsa_callback_data private data to pass back to HSA. Provided in + * hsa_ven_amd_pcs_data_ready_callback_t + * + * @param[in] data_size size of destination buffer in bytes. + * @param[in] destination destination buffer + * @retval TBD: but could be used to indicate that there is no more data to be read. + * Or indicate an error and abort of current copy operations + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data, + size_t data_size, void* destination); + +/** + * @brief HSA callback function to to indicate that there is data ready to be copied + * + * When the client receives this callback, the client should call back @p data_copy_callback for HSA + * to perform the copy operation into an available buffer. @p data_copy_callback can be called back + * multiple times with smaller @p data_size to split the copy operation. + * + * This callback must not call ::hsa_ven_amd_pcs_flush. 
+ * + * @param[in] client_callback_data client private data passed in via + * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id + * @param[in] data_size size of data available to be copied + * @param[in] lost_sample_count number of lost samples since last call to + * hsa_ven_amd_pcs_data_ready_callback_t. + * @param[in] data_copy_callback callback function for HSA to perform the actual copy + * @param[in] hsa_callback_data private data to pass back to HSA + */ +typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)( + void* client_callback_data, size_t data_size, size_t lost_sample_count, + hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data); + +/** + * @brief Opaque handle representing a sampling session. + * Two sessions having same handle value represent the same session + */ +typedef struct { + uint64_t handle; +} hsa_ven_amd_pcs_t; + +/** + * @brief PC Sampling configuration flag options + */ +typedef enum { + /* The interval for this sampling method have to be a power of 2 */ + HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0) +} hsa_ven_amd_pcs_configuration_flags_t; + +/** + * @brief PC Sampling method information + * Used to provide client with list of supported PC Sampling methods + */ +typedef struct { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t min_interval; + size_t max_interval; + uint64_t flags; +} hsa_ven_amd_pcs_configuration_t; + +/** + * @brief Callback function to iterate through list of supported PC Sampling configurations + * + * @param[in] configuration one entry for supported PC Sampling method and configuration options + * @param[in] callback_data client private callback data that was passed in when calling + * hsa_ven_amd_pcs_iterate_configuration + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)( + const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data); + +/** + * @brief Iterate through list of current 
supported PC Sampling configurations for this @p agent + * + * HSA will callback @p configuration_callback for each currently available PC Sampling + * configuration. The list of currently available configurations may not be the complete list of + * configurations supported on the @p agent. The list of currently available configurations may be + * reduced if the @p agent is currently handling other PC sampling sessions. + * + * @param[in] agent target agent + * @param[in] configuration_callback callback function to iterate through list of configurations + * @param[in] callback_data client private callback data + **/ +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +/** + * @brief Create a PC Sampling session on @p agent + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, as described by the + * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of + * hsa_ven_amd_pcs_iterate_configuration for this @p agent. + * A successfull call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. 
This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + + +/** + * @brief Creates a PC Sampling session on @p agent. 
Assumes that the caller provides the + * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling + * on the @p agent. + * + * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing + * PC sampling session that was previously created in the underlying driver. + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, and match the parameters that we used to create + * the underlying PC Sampling session in the underlying driver. + * A successfull call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. 
This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +/** + * @brief Free a PC Sampling session on @p agent + * + * Free all the resources allocated for a PC Sampling session on @p agent + * Internal buffers for this session will be lost. + * If the session was active, the session will be stopped before it is destroyed. 
+ * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Start a PC Sampling session + * + * Activate a PC Sampling session that was previous created. + * The session with be in a active state after this call + * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session started successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Stop a PC Sampling session + * + * Stop a session that is currently active + * After a session is stopped HSA may still have some PC Sampling data in its internal buffers. + * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal + * buffers are not drained and the session is started again, the internal buffers will be available + * on the next data_ready_callback. + * If the session was already inactive, this will result in a no-op and will return + * HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session stopped successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Flush internal buffers for a PC Sampling session + * + * Drain internal buffers for a PC Sampling session. If internal buffers have available data, + * this trigger a data_ready_callback. 
+ * + * The function blocks until all PC samples associated with the @p pc_sampling session + * generated prior to the function call have been communicated by invocations of + * @p data_ready_callback having completed execution. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session flushed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +#define hsa_ven_amd_pc_sampling_1_00 + +/** + * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by + * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t { + hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_create_from_id)( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling); + +} hsa_ven_amd_pc_sampling_1_00_pfn_t; + +#ifdef __cplusplus +} // end extern "C" block 
+#endif /*__cplusplus*/ + +#endif /* HSA_VEN_AMD_PC_SAMPLING_H */ diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index 08836a577..ff70e61bd 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -483,6 +483,10 @@ namespace code { *major = 5; *minor = 0; return true; + case ELF::ELFABIVERSION_AMDGPU_HSA_V6: + *major = 6; + *minor = 0; + return true; } return false; @@ -600,6 +604,12 @@ namespace code { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: MI.Name = "gfx1103"; MI.XnackSupported = false; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: MI.Name = "gfx1150"; MI.XnackSupported = false; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: MI.Name = "gfx1151"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: MI.Name = "gfx9-generic"; MI.XnackSupported = true; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: MI.Name = "gfx10-1-generic"; MI.XnackSupported = true; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC: MI.Name = "gfx10-3-generic"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC: MI.Name = "gfx11-generic"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: MI.Name = "gfx1200"; MI.XnackSupported = false; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: MI.Name = "gfx1201"; MI.XnackSupported = false; MI.SrameccSupported = false; break; default: return false; } return true; @@ -687,13 +697,17 @@ namespace code { return MI.Name; } - bool AmdHsaCode::GetIsa(std::string& isa_name) + bool AmdHsaCode::GetIsa(std::string& isa_name, unsigned *genericVersion) { isa_name.clear(); uint32_t code_object_major_version = 0; uint32_t code_object_minor_version = 0; + // 
Generic versioning starts at 1, so zero means no generic version. + if (genericVersion) + *genericVersion = 0; + switch (img->EClass()) { case ELFCLASS64: // There is no e_machine and/or OS ABI for R600 so rely on checking @@ -740,7 +754,7 @@ namespace code { MI.Name += ":xnack+"; else if (MI.XnackSupported) MI.Name += ":xnack-"; - } else if (code_object_major_version == 4 || code_object_major_version == 5) { + } else if (code_object_major_version >= 4) { switch (img->EFlags() & ELF::EF_AMDGPU_FEATURE_SRAMECC_V4) { case ELF::EF_AMDGPU_FEATURE_SRAMECC_OFF_V4: MI.Name += ":sramecc-"; @@ -758,6 +772,12 @@ namespace code { MI.Name += ":xnack+"; break; } + + // Generic version is not part of the ISA name. + // Only parse it when the caller wants it. + if (genericVersion && code_object_major_version >= 6) { + *genericVersion = (img->EFlags() & ELF::EF_AMDGPU_GENERIC_VERSION) >> ELF::EF_AMDGPU_GENERIC_VERSION_OFFSET; + } } else { return false; } @@ -936,7 +956,7 @@ namespace code { uint64_t offset = ImageInitSection()->addData(&desc, sizeof(desc), 8); amd::elf::Symbol* imageInit = img->symtab()->addSymbol(ImageInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); - image->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_IMAGE, imageInit, image->elfSym()->value() + destOffset, 0); + image->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_V1_INIT_IMAGE, imageInit, image->elfSym()->value() + destOffset, 0); } void AmdHsaCode::AddImageInitializer( @@ -977,7 +997,7 @@ namespace code { uint64_t offset = SamplerInitSection()->addData(&desc, sizeof(desc), 8); amd::elf::Symbol* samplerInit = img->symtab()->addSymbol(SamplerInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); - sampler->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_SAMPLER, samplerInit, sampler->elfSym()->value() + destOffset, 0); + sampler->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_V1_INIT_SAMPLER, 
samplerInit, sampler->elfSym()->value() + destOffset, 0); } void AmdHsaCode::AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, @@ -996,7 +1016,7 @@ namespace code { void AmdHsaCode::AddInitVarWithAddress(bool large, Symbol* dest, uint64_t destOffset, Symbol* addrOf, uint64_t addrAddend) { - uint32_t rtype = large ? R_AMDGPU_64 : R_AMDGPU_32_LOW; + uint32_t rtype = large ? R_AMDGPU_V1_64 : R_AMDGPU_V1_32_LOW; dest->elfSym()->section()->relocationSection()->addRelocation(rtype, addrOf->elfSym(), dest->elfSym()->value() + destOffset, addrAddend); } diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index 2a3b96f24..a6ea83c33 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -81,8 +81,10 @@ __attribute__((noinline)) static void _loader_debug_state() { // 6: New trap handler ABI. ttmp6[25:0] contains dispatch index modulo queue size // 7: New trap handler ABI. Send interrupts as a bitmask, coalescing concurrent exceptions. // 8: New trap handler ABI. for gfx940: Initialize ttmp[4:5] if ttmp11[31] == 0. -// 9: New trap handler API. For gfx11: Save PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped. -HSA_API r_debug _amdgpu_r_debug = {9, +// 9: New trap handler ABI. For gfx11: Save PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped. +// 10: New trap handler ABI. Set status.skip_export when halting the wave. +// For gfx940, set ttmp6[31] = 0 if ttmp11[31] == 0. 
+HSA_API r_debug _amdgpu_r_debug = {10, nullptr, reinterpret_cast(&_loader_debug_state), r_debug::RT_CONSISTENT, @@ -186,6 +188,18 @@ Executable* AmdHsaCodeLoader::CreateExecutable( return executables.back(); } +Executable* AmdHsaCodeLoader::CreateExecutable( + std::unique_ptr isolated_context, + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode) +{ + WriterLockGuard writer_lock(rw_lock_); + + executables.push_back(new ExecutableImpl(profile, std::move(isolated_context), executables.size(), default_float_rounding_mode)); + return executables.back(); +} + static void AddCodeObjectInfoIntoDebugMap(link_map* map) { if (r_debug_tail) { r_debug_tail->l_next = map; @@ -522,6 +536,10 @@ bool KernelSymbol::GetInfo(hsa_symbol_info32_t symbol_info, void *value) { *((bool*)value) = is_dynamic_callstack; break; } + case HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE: { + *((uint32_t*)value) = wavefront_size; + break; + } case HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE: { *((uint32_t*)value) = size; break; @@ -734,6 +752,22 @@ ExecutableImpl::ExecutableImpl( { } +ExecutableImpl::ExecutableImpl( + const hsa_profile_t &_profile, + std::unique_ptr unique_context, + size_t id, + hsa_default_float_rounding_mode_t default_float_rounding_mode) + : Executable() + , profile_(_profile) + , unique_context_(std::move(unique_context)) + , id_(id) + , default_float_rounding_mode_(default_float_rounding_mode) + , state_(HSA_EXECUTABLE_STATE_UNFROZEN) + , program_allocation_segment(nullptr) +{ + context_ = unique_context_.get(); +} + ExecutableImpl::~ExecutableImpl() { for (ExecutableObject* o : objects) { o->Destroy(); @@ -1212,7 +1246,8 @@ hsa_status_t ExecutableImpl::LoadCodeObject( } std::string codeIsa; - if (!code->GetIsa(codeIsa)) { + unsigned genericVersion; + if (!code->GetIsa(codeIsa, &genericVersion)) { logger_ << "LoaderError: failed to determine code object's ISA\n"; return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ 
-1223,7 +1258,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - if (majorVersion < 1 || majorVersion > 5) { + if (majorVersion < 1 || majorVersion > 6) { logger_ << "LoaderError: unsupported code object version: " << majorVersion << "\n"; return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -1251,7 +1286,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( return HSA_STATUS_ERROR_INVALID_ISA_NAME; } - if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa)) { + if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa, genericVersion)) { logger_ << "LoaderError: code object's ISA (" << codeIsa.c_str() << ") is not supported by the agent\n"; return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; } @@ -1434,6 +1469,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, uint32_t group_segment_size = kd.group_segment_fixed_size; uint32_t private_segment_size = kd.private_segment_fixed_size; bool is_dynamic_callstack = AMDHSA_BITS_GET(kd.kernel_code_properties, rocr::llvm::amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); + bool uses_wave32 = AMDHSA_BITS_GET( kd.kernel_code_properties, rocr::llvm::amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); uint64_t size = sym->Size(); @@ -1449,6 +1485,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, is_dynamic_callstack, size, 64, + uses_wave32 ? 32 : 64, address); symbol = kernel_symbol; } else if (sym->IsVariableSymbol()) { @@ -1478,6 +1515,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, uint32_t(akc.workitem_private_segment_byte_size); bool is_dynamic_callstack = AMD_HSA_BITS_GET(akc.kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK) ? 
true : false; + bool uses_wave32 = akc.wavefront_size == AMD_POWERTWO_32; uint64_t size = sym->Size(); @@ -1498,6 +1536,7 @@ hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, is_dynamic_callstack, size, 256, + uses_wave32 ? 32 : 64, address); kernel_symbol->debug_info.elf_raw = code->ElfData(); kernel_symbol->debug_info.elf_size = code->ElfSize(); @@ -1585,15 +1624,24 @@ Segment* ExecutableImpl::SectionSegment(hsa_agent_t agent, code::Section* sec) hsa_status_t ExecutableImpl::ApplyRelocations(hsa_agent_t agent, amd::hsa::code::AmdHsaCode *c) { hsa_status_t status = HSA_STATUS_SUCCESS; + + uint32_t majorVersion, minorVersion; + if (!c->GetCodeObjectVersion(&majorVersion, &minorVersion)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + for (size_t i = 0; i < c->RelocationSectionCount(); ++i) { if (c->GetRelocationSection(i)->targetSection()) { + // Static relocations may be present if --emit-relocs + // option was passed to lld, but they cannot be applied + // again, so skip it for code object v2 and up. + if (majorVersion >= 2) { + continue; + } + status = ApplyStaticRelocationSection(agent, c->GetRelocationSection(i)); } else { // Dynamic relocations are supported starting code object v2.1. 
- uint32_t majorVersion, minorVersion; - if (!c->GetCodeObjectVersion(&majorVersion, &minorVersion)) { - return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; - } if (majorVersion < 2) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -1628,9 +1676,9 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: Segment* rseg = SectionSegment(agent, sec); size_t reladdr = sec->addr() + rel->offset(); switch (rel->type()) { - case R_AMDGPU_32_LOW: - case R_AMDGPU_32_HIGH: - case R_AMDGPU_64: + case R_AMDGPU_V1_32_LOW: + case R_AMDGPU_V1_32_HIGH: + case R_AMDGPU_V1_64: { uint64_t addr; switch (sym->type()) { @@ -1661,15 +1709,15 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: uint32_t addr32 = 0; switch (rel->type()) { - case R_AMDGPU_32_HIGH: + case R_AMDGPU_V1_32_HIGH: addr32 = uint32_t((addr >> 32) & 0xFFFFFFFF); rseg->Copy(reladdr, &addr32, sizeof(addr32)); break; - case R_AMDGPU_32_LOW: + case R_AMDGPU_V1_32_LOW: addr32 = uint32_t(addr & 0xFFFFFFFF); rseg->Copy(reladdr, &addr32, sizeof(addr32)); break; - case R_AMDGPU_64: + case R_AMDGPU_V1_64: rseg->Copy(reladdr, &addr, sizeof(addr)); break; default: @@ -1678,7 +1726,7 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: break; } - case R_AMDGPU_INIT_SAMPLER: + case R_AMDGPU_V1_INIT_SAMPLER: { if (STT_AMDGPU_HSA_METADATA != sym->type() || SHT_PROGBITS != sym->section()->type() || @@ -1709,7 +1757,7 @@ hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa:: break; } - case R_AMDGPU_INIT_IMAGE: + case R_AMDGPU_V1_INIT_IMAGE: { if (STT_AMDGPU_HSA_METADATA != sym->type() || SHT_PROGBITS != sym->section()->type() || @@ -1822,7 +1870,7 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: symAddr += rel->addend(); switch (rel->type()) { - case R_AMDGPU_32_HIGH: + case ELF::R_AMDGPU_ABS32_HI: { if (!symAddr) { logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is 
undefined\n"; @@ -1834,7 +1882,7 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: break; } - case R_AMDGPU_32_LOW: + case ELF::R_AMDGPU_ABS32_LO: { if (!symAddr) { logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; @@ -1846,7 +1894,19 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: break; } - case R_AMDGPU_64: + case ELF::R_AMDGPU_ABS32: + { + if (!symAddr) { + logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; + return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + + uint32_t symAddr32 = uint32_t(symAddr); + relSeg->Copy(rel->offset(), &symAddr32, sizeof(symAddr32)); + break; + } + + case ELF::R_AMDGPU_ABS64: { if (!symAddr) { logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; @@ -1857,7 +1917,7 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa: break; } - case R_AMDGPU_RELATIVE64: + case ELF::R_AMDGPU_RELATIVE64: { int64_t baseDelta = reinterpret_cast(relSeg->Address(0)) - relSeg->VAddr(); uint64_t relocatedAddr = baseDelta + rel->addend(); diff --git a/src/loader/executable.hpp b/src/loader/executable.hpp index c360df348..9429ff948 100644 --- a/src/loader/executable.hpp +++ b/src/loader/executable.hpp @@ -144,6 +144,7 @@ class KernelSymbol final: public SymbolImpl { const bool &_is_dynamic_callstack, const uint32_t &_size, const uint32_t &_alignment, + const uint32_t &_wavefront_size, const uint64_t &_address = 0) : SymbolImpl(_is_loaded, HSA_SYMBOL_KIND_KERNEL, @@ -159,7 +160,8 @@ class KernelSymbol final: public SymbolImpl { , private_segment_size(_private_segment_size) , is_dynamic_callstack(_is_dynamic_callstack) , size(_size) - , alignment(_alignment) {} + , alignment(_alignment) + , wavefront_size(_wavefront_size) {} ~KernelSymbol() {} @@ -173,6 +175,7 @@ class KernelSymbol final: public SymbolImpl { bool is_dynamic_callstack; uint32_t size; uint32_t 
alignment; + uint32_t wavefront_size; amd_runtime_loader_debug_info_t debug_info; private: @@ -420,6 +423,12 @@ friend class AmdHsaCodeLoader; size_t id, hsa_default_float_rounding_mode_t default_float_rounding_mode); + ExecutableImpl( + const hsa_profile_t &_profile, + std::unique_ptr unique_context, + size_t id, + hsa_default_float_rounding_mode_t default_float_rounding_mode); + ~ExecutableImpl(); hsa_status_t GetInfo(hsa_executable_info_t executable_info, void *value) override; @@ -546,6 +555,7 @@ friend class AmdHsaCodeLoader; amd::hsa::common::ReaderWriterLock rw_lock_; hsa_profile_t profile_; Context *context_; + std::unique_ptr unique_context_; Logger logger_; const size_t id_; hsa_default_float_rounding_mode_t default_float_rounding_mode_; @@ -575,6 +585,12 @@ class AmdHsaCodeLoader : public Loader { const char *options, hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) override; + Executable* CreateExecutable( + std::unique_ptr isolated_context, + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) override; + hsa_status_t FreezeExecutable(Executable *executable, const char *options) override; void DestroyExecutable(Executable *executable) override; diff --git a/src/pcs/hsa_ven_amd_pc_sampling.cpp b/src/pcs/hsa_ven_amd_pc_sampling.cpp new file mode 100644 index 000000000..f57d7353c --- /dev/null +++ b/src/pcs/hsa_ven_amd_pc_sampling.cpp @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "pcs_runtime.h" +#include "core/inc/agent.h" +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/exceptions.h" + +namespace rocr { +namespace AMD { +hsa_status_t handleException(); + +template static __forceinline T handleExceptionT() { + handleException(); + abort(); + return T(); +} +} // namespace AMD + +#define IS_OPEN() \ + do { \ + if (!core::Runtime::runtime_singleton_->IsOpen()) return HSA_STATUS_ERROR_NOT_INITIALIZED; \ + } while (false) + +template static __forceinline bool IsValid(T* ptr) { + return (ptr == NULL) ? NULL : ptr->IsValid(); +} + +#define TRY try { +#define CATCH \ + } \ + catch (...) { \ + return AMD::handleException(); \ + } +#define CATCHRET(RETURN_TYPE) \ + } \ + catch (...) { \ + return AMD::handleExceptionT(); \ + } + +namespace pcs { + +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t hsa_agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data) { + TRY; + IS_OPEN(); + + core::Agent* agent = core::Agent::Convert(hsa_agent); + if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice) + return HSA_STATUS_ERROR_INVALID_AGENT; + + return PcsRuntime::instance()->PcSamplingIterateConfig(agent, configuration_callback, + callback_data); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t hsa_agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + TRY; + IS_OPEN(); + core::Agent* agent = core::Agent::Convert(hsa_agent); + if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice) + return HSA_STATUS_ERROR_INVALID_AGENT; + + return PcsRuntime::instance()->PcSamplingCreate( + agent, method, units, interval, latency, 
buffer_size, data_ready_cb, client_cb_data, handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_create_from_id(uint32_t pcs_id, hsa_agent_t hsa_agent, + hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + TRY; + IS_OPEN(); + core::Agent* agent = core::Agent::Convert(hsa_agent); + if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice) + return HSA_STATUS_ERROR_INVALID_AGENT; + + return PcsRuntime::instance()->PcSamplingCreateFromId(pcs_id, agent, method, units, interval, + latency, buffer_size, data_ready_cb, + client_cb_data, handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingDestroy(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingStart(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingStop(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingFlush(handle); + CATCH; +} + +void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api) { + pcs_api->hsa_ven_amd_pcs_iterate_configuration_fn = hsa_ven_amd_pcs_iterate_configuration; + pcs_api->hsa_ven_amd_pcs_create_fn = hsa_ven_amd_pcs_create; + pcs_api->hsa_ven_amd_pcs_create_from_id_fn = hsa_ven_amd_pcs_create_from_id; + pcs_api->hsa_ven_amd_pcs_destroy_fn = hsa_ven_amd_pcs_destroy; + pcs_api->hsa_ven_amd_pcs_start_fn = hsa_ven_amd_pcs_start; + pcs_api->hsa_ven_amd_pcs_stop_fn = hsa_ven_amd_pcs_stop; + pcs_api->hsa_ven_amd_pcs_flush_fn = hsa_ven_amd_pcs_flush; +} + +} // namespace pcs +} // namespace rocr diff --git 
a/src/pcs/inc/hsa_ven_amd_pc_sampling_impl.h b/src/pcs/inc/hsa_ven_amd_pc_sampling_impl.h new file mode 100644 index 000000000..58ed4d437 --- /dev/null +++ b/src/pcs/inc/hsa_ven_amd_pc_sampling_impl.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_IMPL_H +#define HSA_VEN_AMD_PC_SAMPLING_IMPL_H + +#include "inc/hsa.h" +#include "inc/hsa_ext_amd.h" +#include "inc/hsa_ven_amd_pc_sampling.h" +#include "core/inc/hsa_ext_interface.h" + +//---------------------------------------------------------------------------// +// APIs that implement PC Sampling functionality +//---------------------------------------------------------------------------// + +namespace rocr { +namespace pcs { + +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +// Update Api table with func pointers that implement 
functionality +void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api); + +// Release resources acquired by Image implementation +void ReleasePcSamplingRsrcs(); + +} // namespace pcs +} // namespace rocr + +#endif // HSA_VEN_AMD_PC_SAMPLING_IMPL_H diff --git a/src/pcs/pcs_runtime.cpp b/src/pcs/pcs_runtime.cpp new file mode 100644 index 000000000..9d453bb31 --- /dev/null +++ b/src/pcs/pcs_runtime.cpp @@ -0,0 +1,355 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "pcs_runtime.h" + +#include +#include + +#include "core/inc/runtime.h" + +#include "core/inc/amd_gpu_agent.h" + +namespace rocr { +namespace pcs { + +#define IS_BAD_PTR(ptr) \ +do { \ + if ((ptr) == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT; \ +} while (false) + +std::atomic PcsRuntime::instance_(NULL); +std::mutex PcsRuntime::instance_mutex_; + +PcsRuntime* PcsRuntime::instance() { + PcsRuntime* instance = instance_.load(std::memory_order_acquire); + if (instance == NULL) { + // Protect the initialization from multi threaded access. + std::lock_guard lock(instance_mutex_); + + // Make sure we are not initializing it twice. 
+ instance = instance_.load(std::memory_order_relaxed); + if (instance != NULL) { + return instance; + } + + instance = CreateSingleton(); + if (instance == NULL) { + return NULL; + } + } + + return instance; +} + +PcsRuntime* PcsRuntime::CreateSingleton() { + PcsRuntime* instance = new PcsRuntime(); + + instance_.store(instance, std::memory_order_release); + return instance; +} + +void PcsRuntime::DestroySingleton() { + PcsRuntime* instance = instance_.load(std::memory_order_acquire); + if (instance == NULL) { + return; + } + + instance_.store(NULL, std::memory_order_release); + delete instance; +} + +void ReleasePcSamplingRsrcs() { PcsRuntime::DestroySingleton(); } + +bool PcsRuntime::SessionsActive() const { + return pc_sampling_.size() > 0; +} + +PcsRuntime::PcSamplingSession::PcSamplingSession( + core::Agent* _agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, + size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data) + : agent(_agent), thunkId_(0), active_(false), valid_(true), sample_size_(0) { + switch (method) { + case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1: + sample_size_ = sizeof(perf_sample_hosttrap_v1_t); + break; + case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1: + sample_size_ = sizeof(perf_sample_snapshot_v1_t); + break; + default: + valid_ = false; + return; + } + + if (!interval || !buffer_size || (buffer_size % (2 * sample_size_))) { + valid_ = false; + return; + } + + csd.method = method; + csd.units = units; + csd.interval = interval; + csd.latency = latency; + csd.buffer_size = buffer_size; + csd.data_ready_callback = data_ready_callback; + csd.client_callback_data = client_callback_data; +} + +void PcsRuntime::PcSamplingSession::GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo) { + sampleInfo->value_min = 0; + sampleInfo->value_max = 0; + sampleInfo->flags = 0; + sampleInfo->value = csd.interval; + + switch (csd.method) { + case 
HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1: + sampleInfo->method = HSA_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1; + break; + case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1: + sampleInfo->method = HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1; + break; + } + + switch (csd.units) { + case HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS: + sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS; + break; + case HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES: + sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_CYCLES; + break; + case HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS: + sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS; + break; + } +} + +hsa_status_t PcSamplingDataCopyCallback(void* _session, size_t bytes_to_copy, void* destination) { + assert(_session); + assert(destination); + + PcsRuntime::PcSamplingSession* session = + reinterpret_cast(_session); + + return session->DataCopyCallback(reinterpret_cast(destination), bytes_to_copy); +} + +hsa_status_t PcsRuntime::PcSamplingSession::DataCopyCallback(uint8_t* buffer, + size_t bytes_to_copy) { + if (bytes_to_copy != (data_rdy.buf1_sz + data_rdy.buf2_sz)) return HSA_STATUS_ERROR_EXCEPTION; + + if (data_rdy.buf1_sz) memcpy(buffer, data_rdy.buf1, data_rdy.buf1_sz); + if (data_rdy.buf2_sz) memcpy(buffer + data_rdy.buf1_sz, data_rdy.buf2, data_rdy.buf2_sz); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t PcsRuntime::PcSamplingSession::HandleSampleData(uint8_t* buf1, size_t buf1_sz, + uint8_t* buf2, size_t buf2_sz, + size_t lost_sample_count) { + data_rdy.buf1 = buf1; + data_rdy.buf1_sz = buf1_sz; + data_rdy.buf2 = buf2; + data_rdy.buf2_sz = buf2_sz; + + AMD::GpuAgent* gpuAgent = static_cast(agent); + + switch (csd.method) { + case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1: { + size_t buf_samples = buf1_sz / sizeof(perf_sample_hosttrap_v1_t); + perf_sample_hosttrap_v1_t* samples = reinterpret_cast(buf1); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + + buf_samples = 
buf2_sz / sizeof(perf_sample_hosttrap_v1_t); + samples = reinterpret_cast(buf2); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + } + break; + case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1: { + size_t buf_samples = buf1_sz / sizeof(perf_sample_snapshot_v1_t); + perf_sample_snapshot_v1_t* samples = reinterpret_cast(buf1); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + + buf_samples = buf2_sz / sizeof(perf_sample_snapshot_v1_t); + samples = reinterpret_cast(buf2); + while (buf_samples--) { + samples->timestamp = gpuAgent->TranslateTime(samples->timestamp); + samples++; + } + } + break; + } + + csd.data_ready_callback(csd.client_callback_data, buf1_sz + buf2_sz, lost_sample_count, + &PcSamplingDataCopyCallback, + /* hsa_callback_data*/ this); + return HSA_STATUS_SUCCESS; +} + +hsa_status_t PcsRuntime::PcSamplingIterateConfig( + core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data) { + AMD::GpuAgentInt* gpu_agent = static_cast(agent); + return gpu_agent->PcSamplingIterateConfig(configuration_callback, callback_data); +} + +hsa_status_t PcsRuntime::PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + + IS_BAD_PTR(handle); + IS_BAD_PTR(data_ready_cb); + + return PcSamplingCreateInternal( + agent, method, units, interval, latency, buffer_size, data_ready_cb, client_cb_data, handle, + [](core::Agent* agent_, PcSamplingSession& session_) { + return static_cast(agent_)->PcSamplingCreate(session_); + }); +} + +hsa_status_t PcsRuntime::PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent, + hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, 
size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle) { + IS_BAD_PTR(handle); + IS_BAD_PTR(data_ready_cb); + + return PcSamplingCreateInternal( + agent, method, units, interval, latency, buffer_size, data_ready_cb, client_cb_data, handle, + [&](core::Agent* agent_, PcSamplingSession& session_) { + return static_cast(agent_)->PcSamplingCreateFromId(ioctl_pcs_id, + session_); + }); +} + +hsa_status_t PcsRuntime::PcSamplingCreateInternal( + core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, + size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data, + hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) { + ScopedAcquire lock(&pc_sampling_lock_); + + handle->handle = ++pc_sampling_id_; + // create a new PcSamplingSession(agent, method, units, interval, latency, buffer_size, + // data_ready_cb, client_cb_data) reference and insert into pc_sampling_ + pc_sampling_.emplace(std::piecewise_construct, std::forward_as_tuple(handle->handle), + std::forward_as_tuple(agent, method, units, interval, latency, buffer_size, + data_ready_cb, client_cb_data)); + + if (!pc_sampling_[handle->handle].isValid()) { + pc_sampling_.erase(handle->handle); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_status_t ret = agent_pcs_create_fn(agent, pc_sampling_[handle->handle]); + if (ret != HSA_STATUS_SUCCESS) { + pc_sampling_.erase(handle->handle); + return ret; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + 
AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + hsa_status_t ret = gpu_agent->PcSamplingDestroy(pcSamplingSessionIt->second); + pc_sampling_.erase(pcSamplingSessionIt); + return ret; +} + +hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingStart(pcSamplingSessionIt->second); +} + +hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingStop(pcSamplingSessionIt->second); +} + +hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingFlush(pcSamplingSessionIt->second); +} + +} // namespace pcs +} // namespace rocr diff --git a/src/pcs/pcs_runtime.h b/src/pcs/pcs_runtime.h new file mode 100644 index 000000000..6fa489c73 --- /dev/null +++ b/src/pcs/pcs_runtime.h @@ -0,0 +1,176 @@ 
+//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_PCS_RUNTIME_H +#define HSA_RUNTIME_PCS_RUNTIME_H + +#include +#include +#include + +#include "hsakmt/hsakmt.h" + +#include "hsa_ven_amd_pc_sampling.h" +#include "core/inc/agent.h" +#include "core/inc/exceptions.h" + + +namespace rocr { +namespace pcs { + +class PcsRuntime { + public: + PcsRuntime() : pc_sampling_id_(0) {} + ~PcsRuntime() {} + + /// @brief Getter for the PcsRuntime singleton object. + static PcsRuntime* instance(); + + bool SessionsActive() const; + + /// @brief Destroy singleton object. + static void DestroySingleton(); + + class PcSamplingSession { + public: + PcSamplingSession() : agent(NULL), thunkId_(0), active_(false){}; + PcSamplingSession(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data); + ~PcSamplingSession(){}; + + const bool isValid() { return valid_; } + const size_t buffer_size() { return csd.buffer_size; } + const hsa_ven_amd_pcs_method_kind_t method() { return csd.method; } + const size_t latency() { return csd.latency; } + const size_t sample_size() { return sample_size_; } + + void GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo); + hsa_status_t HandleSampleData(uint8_t* buf1, size_t buf1_sz, uint8_t* buf2, size_t buf2_sz, + size_t lost_sample_count); + hsa_status_t DataCopyCallback(uint8_t* buffer, size_t buffer_size); + + core::Agent* agent; + void SetThunkId(HsaPcSamplingTraceId thunkId) { thunkId_ = thunkId; } + HsaPcSamplingTraceId ThunkId() { return thunkId_; } + bool isActive() { return active_; } + void start() { active_ = true; } + void stop() { active_ = false; } + + private: + HsaPcSamplingTraceId thunkId_; + + bool active_; // Set to true when the session is started + bool valid_; // Whether configuration parameters are valid + 
size_t sample_size_; + + struct client_session_data_t { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t interval; + size_t latency; + size_t buffer_size; + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback; + void* client_callback_data; + }; + struct client_session_data_t csd; + + struct data_ready_info_t { + uint8_t* buf1; + size_t buf1_sz; + uint8_t* buf2; + size_t buf2_sz; + }; + struct data_ready_info_t data_rdy; + }; // class PcSamplingSession + + hsa_status_t PcSamplingIterateConfig( + core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle); + + + hsa_status_t PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent, + hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle); + + hsa_status_t PcSamplingDestroy(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingStart(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingStop(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingFlush(hsa_ven_amd_pcs_t handle); + + private: + /// @brief Initialize singleton object, must be called once. + static PcsRuntime* CreateSingleton(); + + /// Pointer to singleton object. 
+ static std::atomic instance_; + static std::mutex instance_mutex_; + + // Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle + std::map pc_sampling_; + KernelMutex pc_sampling_lock_; + uint64_t pc_sampling_id_; + + DISALLOW_COPY_AND_ASSIGN(PcsRuntime); + + using agent_pcs_create_fn_t = std::function; + hsa_status_t PcSamplingCreateInternal(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, + void* client_cb_data, hsa_ven_amd_pcs_t* handle, + agent_pcs_create_fn_t agent_pcs_create_fn); +}; + +} // namespace pcs +} // namespace rocr +#endif // HSA_RUNTIME_PCS_RUNTIME_H