From 635157ea802c95b5f9de740eeef2e92fb296515c Mon Sep 17 00:00:00 2001
From: Sean Keely
Date: Wed, 9 Feb 2022 21:11:38 -0600
Subject: [PATCH] ROCm 5.0.0 updates

---
 src/CMakeLists.txt                     |   9 +-
 src/core/inc/agent.h                   |  15 +++
 src/core/inc/amd_memory_region.h       |  10 +-
 src/core/inc/queue.h                   |   2 +-
 src/core/inc/runtime.h                 |   8 +-
 src/core/runtime/amd_aql_queue.cpp     |   2 +-
 src/core/runtime/amd_gpu_agent.cpp     |  42 ++++++--
 src/core/runtime/amd_memory_region.cpp |  22 +++-
 src/core/runtime/hsa_ext_amd.cpp       |  22 ++--
 src/core/runtime/isa.cpp               |   3 +
 src/core/runtime/runtime.cpp           |  37 +++----
 src/core/util/flag.cpp                 |   4 +-
 src/core/util/flag.h                   |  14 +++
 src/core/util/lnx/os_linux.cpp         |  58 ++++++++++
 src/core/util/locks.h                  | 142 ++++++++++++++++++++-----
 src/core/util/os.h                     |  43 ++++++++
 src/core/util/utils.h                  |   1 +
 src/core/util/win/os_win.cpp           |  46 ++++++++
 src/image/blit_kernel.cpp              |   3 +
 src/image/blit_src/CMakeLists.txt      |   2 +-
 src/image/util.h                       |   1 +
 src/inc/amd_hsa_elf.h                  |   3 +-
 src/inc/hsa_ext_amd.h                  |   8 +-
 src/libamdhsacode/amd_hsa_code.cpp     |   1 +
 24 files changed, 413 insertions(+), 85 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index efc441841..8fb02b14e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -85,7 +85,7 @@ if (ROCM_CCACHE_BUILD)
 endif() # if (ROCM_CCACHE_BUILD)

 ## Get version strings
-get_version ( "1.4.0" )
+get_version ( "1.5.0" )
 if ( ${ROCM_PATCH_VERSION} )
     set ( VERSION_PATCH ${ROCM_PATCH_VERSION})
 endif()
@@ -126,7 +126,7 @@ target_include_directories( ${CORE_RUNTIME_TARGET}
 set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib;$ORIGIN/../../lib64;$ORIGIN/../lib64" )

 ## ------------------------- Linux Compiler and Linker options -------------------------
-set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -Werror -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function )
+set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function )

 ## Extra image settings - audit!
 set ( HSA_CXX_FLAGS ${HSA_CXX_FLAGS} -Wno-deprecated-declarations )
@@ -301,7 +301,7 @@ install ( TARGETS ${CORE_RUNTIME_TARGET} EXPORT ${CORE_RUNTIME_NAME}Targets
   LIBRARY DESTINATION lib COMPONENT binary )

 # Install license
-#install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary )
+install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary )

 # Install public headers
 # TODO: Fix me for flat directory layout. Should be ${CMAKE_INSTALL_INCLUDEDIR}
@@ -420,7 +420,7 @@ set ( CPACK_DEBIAN_DEV_PACKAGE_CONTROL_EXTRA "DEBIAN/Dev/postinst;DEBIAN/Dev/pre
 set ( CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "hsakmt-roct" )
 set ( CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "hsa-rocr" )
 if ( ROCM_DEP_ROCMCORE )
-  string ( APPEND PACK_DEBIAN_BINARY_PACKAGE_DEPENDS ", rocm-core" )
+  string ( APPEND CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS ", rocm-core" )
   string ( APPEND CPACK_DEBIAN_DEV_PACKAGE_DEPENDS ", rocm-core" )
 endif()
 set ( CPACK_DEBIAN_PACKAGE_BREAKS "hsa-ext-rocr-dev" )
@@ -438,6 +438,7 @@ endif()
 string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
 set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" )
 message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
+set( CPACK_RPM_PACKAGE_LICENSE "NCSA" )

 ## Process the Rpm install/remove scripts to update the CPACK variables
 configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/Binary/post.in" RPM/Binary/post @ONLY )

diff --git a/src/core/inc/agent.h b/src/core/inc/agent.h
index 08683934c..d669ab16e 100644
--- a/src/core/inc/agent.h
+++ b/src/core/inc/agent.h
@@ -53,8 +53,15 @@
 #include "core/inc/queue.h"
 #include "core/inc/memory_region.h"
 #include "core/util/utils.h"
+#include "core/util/locks.h"

 namespace rocr {
+
+// Forward declare AMD::MemoryRegion
+namespace AMD {
+class MemoryRegion;
+}
+
 namespace core {
 class Signal;
@@ -65,6 +72,8 @@ typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source,
 // replaced by tools libraries. All funtions other than Convert, node_id,
 // device_type, and public_handle must be virtual.
 class Agent : public Checked<0xF6BC25EB17E6F917> {
+  friend class rocr::AMD::MemoryRegion;
+
  public:
   // @brief Convert agent object into hsa_agent_t.
   //
@@ -297,6 +306,12 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {

   bool profiling_enabled_;

+  // Used by an Agent's MemoryRegions to ensure serial memory operation on the device.
+  // Serial memory operations are needed to ensure, among other things, that allocation failures are
+  // due to true OOM conditions and per region caching (Trim and Allocate must be serial and
+  // exclusive to ensure this).
+  KernelMutex agent_memory_lock_;
+
   // Forbid copying and moving of this object
   DISALLOW_COPY_AND_ASSIGN(Agent);
 };

diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h
index cce01e658..f68241585 100644
--- a/src/core/inc/amd_memory_region.h
+++ b/src/core/inc/amd_memory_region.h
@@ -185,6 +185,8 @@ class MemoryRegion : public core::MemoryRegion {

   HSAuint64 virtual_size_;

+  // Protects against concurrent allow_access calls to fragments of the same block by virtue of all
+  // fragments of the block routing to the same MemoryRegion.
   mutable KernelMutex access_lock_;

   static const size_t kPageSize_ = 4096;
@@ -193,6 +195,12 @@ class MemoryRegion : public core::MemoryRegion {
   hsa_amd_memory_pool_access_t GetAccessInfo(const core::Agent& agent,
                                              const core::Runtime::LinkInfo& link_info) const;

+  // Operational body for Allocate. Recursive.
+  hsa_status_t AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const;
+
+  // Operational body for Free. Recursive.
+  hsa_status_t FreeImpl(void* address, size_t size) const;
+
   class BlockAllocator {
    private:
     MemoryRegion& region_;
@@ -200,7 +208,7 @@
    public:
     explicit BlockAllocator(MemoryRegion& region) : region_(region) {}
     void* alloc(size_t request_size, size_t& allocated_size) const;
-    void free(void* ptr, size_t length) const { region_.Free(ptr, length); }
+    void free(void* ptr, size_t length) const { region_.FreeImpl(ptr, length); }
     size_t block_size() const { return block_size_; }
   };

diff --git a/src/core/inc/queue.h b/src/core/inc/queue.h
index 71866ff8f..8d2bbf4a2 100644
--- a/src/core/inc/queue.h
+++ b/src/core/inc/queue.h
@@ -73,7 +73,7 @@ struct AqlPacket {
   }

   bool IsValid() const {
-    return (type() <= HSA_PACKET_TYPE_BARRIER_OR) & (type() != HSA_PACKET_TYPE_INVALID);
+    return int(type() <= HSA_PACKET_TYPE_BARRIER_OR) & (type() != HSA_PACKET_TYPE_INVALID);
   }

   std::string string() const {

diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h
index b481f2da6..9f5b8acc5 100644
--- a/src/core/inc/runtime.h
+++ b/src/core/inc/runtime.h
@@ -487,10 +487,14 @@ class Runtime {
   /// @brief Get most recently issued SVM prefetch agent for the range in question.
   Agent* GetSVMPrefetchAgent(void* ptr, size_t size);

-  // Mutex object to protect multithreaded access to ::allocation_map_,
+  /// @brief Get the highest used node id.
+  uint32_t max_node_id() const { return agents_by_node_.rbegin()->first; }
+
+  // Mutex object to protect multithreaded access to ::allocation_map_.
+  // Also ensures atomicity of pointer info queries by interlocking
   // KFD map/unmap, register/unregister, and access to hsaKmtQueryPointerInfo
   // registered & mapped arrays.
-  KernelMutex memory_lock_;
+  KernelSharedMutex memory_lock_;

   // Array containing tools library handles.
   std::vector<os::LibHandle> tool_libs_;

diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp
index bac6fc5b7..486b63cd0 100644
--- a/src/core/runtime/amd_aql_queue.cpp
+++ b/src/core/runtime/amd_aql_queue.cpp
@@ -313,7 +313,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
   MAKE_NAMED_SCOPE_GUARD(PM4IBGuard, [&]() { agent_->system_deallocator()(pm4_ib_buf_); });

   // Set initial CU mask
-  SetCUMasking(0, nullptr);
+  if (!core::Runtime::runtime_singleton_->flag().cu_mask_skip_init()) SetCUMasking(0, nullptr);

   active_ = true;

diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
index c0a70837b..1f8b70ab9 100644
--- a/src/core/runtime/amd_gpu_agent.cpp
+++ b/src/core/runtime/amd_gpu_agent.cpp
@@ -599,14 +599,21 @@ void GpuAgent::InitDma() {
   queues_[QueueUtility].reset(queue_lambda);

   // Decide which engine to use for blits.
-  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue) {
+  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev) {
     Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();

-    // User SDMA queues are unstable on gfx8.
-    bool use_sdma = ((isa_->GetMajorVersion() != 8));
+    // User SDMA queues are unstable on gfx8 and unsupported on gfx1013.
+    bool use_sdma =
+        ((isa_->GetMajorVersion() != 8) && (isa_->GetVersion() != std::make_tuple(10, 1, 3)));
     if (sdma_override != Flag::SDMA_DEFAULT) use_sdma = (sdma_override == Flag::SDMA_ENABLE);

     if (use_sdma && (HSA_PROFILE_BASE == profile_)) {
+      // On gfx90a ensure that HostToDevice queue is created first and so is placed on SDMA0.
+      if ((!use_xgmi) && (!isHostToDev) && (isa_->GetMajorVersion() == 9) &&
+          (isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) {
+        *blits_[BlitHostToDev];
+      }
+
       auto ret = CreateBlitSdma(use_xgmi);
       if (ret != nullptr) return ret;
     }
@@ -642,13 +649,14 @@
     return ret;
   });

   blits_[BlitHostToDev].reset(
-      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly]); });
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true); });
   blits_[BlitDevToHost].reset(
-      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false); });

   // XGMI engines.
   for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
-    blits_[idx].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility]); });
+    blits_[idx].reset(
+        [blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility], false); });
   }

   // GWS queues.
@@ -794,7 +802,11 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
           HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR;
       break;
     case HSA_AGENT_INFO_FAST_F16_OPERATION:
-      *((bool*)value) = false;
+      if (isa_->GetMajorVersion() >= 8) {
+        *((bool*)value) = true;
+      } else {
+        *((bool*)value) = false;
+      }
       break;
     case HSA_AGENT_INFO_PROFILE:
       *((hsa_profile_t*)value) = profile_;
@@ -998,6 +1010,17 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
     case HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS:
       assert(regions_.size() != 0 && "No device local memory found!");
       *((bool*)value) = properties_.Capability.ui32.CoherentHostAccess == 1;
+    case HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT:
+      if (core::Runtime::runtime_singleton_->flag().coop_cu_count() &&
+          (isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) &&
+          (isa_->GetStepping() == 10)) {
+        uint32_t count = 0;
+        hsa_status_t err = GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &count);
+        assert(err == HSA_STATUS_SUCCESS && "CU count query failed.");
+        *((uint32_t*)value) = (count & 0xFFFFFFF8) - 8;  // value = floor(count/8)*8-8
+        break;
+      }
+      return GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, value);
     default:
       return HSA_STATUS_ERROR_INVALID_ARGUMENT;
       break;
@@ -1504,10 +1527,11 @@ lazy_ptr<core::Blit>& GpuAgent::GetPcieBlit(const core::Agent& dst_agent,
   lazy_ptr<core::Blit>& blit =
       (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
        dst_agent.device_type() == core::Agent::kAmdGpuDevice)
-          ? blits_[BlitHostToDev]
+          ? blits_[BlitHostToDev]  // CPU->GPU transfer.
           : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
              dst_agent.device_type() == core::Agent::kAmdCpuDevice)
-                ? blits_[BlitDevToHost] : blits_[BlitDevToHost];
+                ? blits_[BlitDevToHost]  // GPU->CPU transfer.
+                : blits_[BlitDevToHost];  // GPU->GPU transfer.
   return blit;
 }

diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp
index d96e5db29..7cbb54395 100644
--- a/src/core/runtime/amd_memory_region.cpp
+++ b/src/core/runtime/amd_memory_region.cpp
@@ -157,6 +157,12 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, cor
 MemoryRegion::~MemoryRegion() {}

 hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const {
+  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  return AllocateImpl(size, alloc_flags, address);
+}
+
+hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags,
+                                        void** address) const {
   if (address == NULL) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
@@ -251,6 +257,11 @@ hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, voi
 }

 hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
+  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
+  return FreeImpl(address, size);
+}
+
+hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const {
   if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS;

   MakeKfdMemoryUnresident(address);
@@ -262,6 +273,7 @@ hsa_status_t MemoryRegion::Free(void* address, size_t size) const {

 // TODO: Look into a better name and/or making this process transparent to exporting.
 hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const {
+  ScopedAcquire<KernelMutex> lock(&owner()->agent_memory_lock_);
   if (!fragment_allocator_.discardBlock(address)) return HSA_STATUS_ERROR_INVALID_ALLOCATION;
   return HSA_STATUS_SUCCESS;
 }
@@ -583,8 +595,10 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
   HsaMemMapFlags map_flag = map_flag_;
   map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0;

-  {
-    ScopedAcquire<KernelMutex> lock(&core::Runtime::runtime_singleton_->memory_lock_);
+  {  // Sequence with pointer info since queries to other fragments of the block may be adjusted by
+     // this call.
+    ScopedAcquire<KernelSharedMutex::Shared> lock(
+        core::Runtime::runtime_singleton_->memory_lock_.shared());
     uint64_t alternate_va = 0;
     if (!AMD::MemoryRegion::MakeKfdMemoryResident(
             whitelist_nodes.size(), &whitelist_nodes[0], ptr,
@@ -593,8 +607,6 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
     }
   }

-  lock.Release();
-
   return HSA_STATUS_SUCCESS;
 }
@@ -700,7 +712,7 @@ void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated
   void* ret;
   size_t bsize = AlignUp(request_size, block_size());

-  hsa_status_t err = region_.Allocate(
+  hsa_status_t err = region_.AllocateImpl(
       bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret);
   if (err != HSA_STATUS_SUCCESS)
     throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");

diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp
index 5ab779c93..5d7eac6cb 100644
--- a/src/core/runtime/hsa_ext_amd.cpp
+++ b/src/core/runtime/hsa_ext_amd.cpp
@@ -567,14 +567,14 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
                                  void** agent_ptr) {
   TRY;
   IS_OPEN();
-  *agent_ptr = NULL;

-  if (size == 0 || host_ptr == NULL || agent_ptr == NULL) {
+  if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }

-  if ((agents != NULL && num_agent == 0) ||
-      (agents == NULL && num_agent != 0)) {
+  *agent_ptr = nullptr;
+
+  if ((agents != nullptr && num_agent == 0) || (agents == nullptr && num_agent != 0)) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
@@ -598,13 +598,14 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_
                                          void** agent_ptr) {
   TRY;
   IS_OPEN();
-  *agent_ptr = NULL;

-  if (size == 0 || host_ptr == NULL || agent_ptr == NULL || flags != 0) {
+  if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr || flags != 0) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }

-  if ((agents != NULL && num_agent == 0) || (agents == NULL && num_agent != 0)) {
+  *agent_ptr = nullptr;
+
+  if ((agents != nullptr && num_agent == 0) || (agents == nullptr && num_agent != 0)) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
@@ -806,9 +807,13 @@ hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents,
   core::Agent** core_agents = short_agents;
   if (num_agents > tinyArraySize) {
     core_agents = new core::Agent* [num_agents];
-    if (core_agents == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    if (core_agents == nullptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }

+  MAKE_SCOPE_GUARD([&]() {
+    if (num_agents > tinyArraySize) delete[] core_agents;
+  });
+
   for (uint32_t i = 0; i < num_agents; i++) {
     core::Agent* device = core::Agent::Convert(agents[i]);
     IS_VALID(device);
@@ -819,7 +824,6 @@ hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents,
       num_agents, core_agents, interop_handle, flags, size, ptr, metadata_size,
       metadata);

-  if (num_agents > tinyArraySize) delete[] core_agents;
   return ret;
   CATCH;
 }

diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp
index 639a13ead..e9a48f378 100755
--- a/src/core/runtime/isa.cpp
+++ b/src/core/runtime/isa.cpp
@@ -305,6 +305,9 @@ constexpr size_t hsa_name_size = 63;
   ISAREG_ENTRY_GEN("gfx1012", 10, 1, 2, unsupported, any)
   ISAREG_ENTRY_GEN("gfx1012:xnack-", 10, 1, 2, unsupported, disabled)
   ISAREG_ENTRY_GEN("gfx1012:xnack+", 10, 1, 2, unsupported, enabled)
+  ISAREG_ENTRY_GEN("gfx1013", 10, 1, 3, unsupported, any)
+  ISAREG_ENTRY_GEN("gfx1013:xnack-", 10, 1, 3, unsupported, disabled)
+  ISAREG_ENTRY_GEN("gfx1013:xnack+", 10, 1, 3, unsupported, enabled)
ISAREG_ENTRY_GEN("gfx1030", 10, 3, 0, unsupported, unsupported) ISAREG_ENTRY_GEN("gfx1031", 10, 3, 1, unsupported, unsupported) ISAREG_ENTRY_GEN("gfx1032", 10, 3, 2, unsupported, unsupported) diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 59980fa92..019b739ca 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -276,11 +276,10 @@ hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent, hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, MemoryRegion::AllocateFlags alloc_flags, void** address) { - ScopedAcquire lock(&memory_lock_); hsa_status_t status = region->Allocate(size, alloc_flags, address); - // Track the allocation result so that it could be freed properly. if (status == HSA_STATUS_SUCCESS) { + ScopedAcquire lock(&memory_lock_); allocation_map_[*address] = AllocationRegion(region, size); } @@ -297,7 +296,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { std::unique_ptr> notifiers; { - ScopedAcquire lock(&memory_lock_); + ScopedAcquire lock(&memory_lock_); std::map::iterator it = allocation_map_.find(ptr); @@ -317,26 +316,23 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { notifiers = std::move(it->second.notifiers); allocation_map_.erase(it); - - // Fast path to avoid doubling lock ops in the common case (no notifiers). - if (!notifiers) return region->Free(ptr, size); } // Notifiers can't run while holding the lock or the callback won't be able to manage memory. // The memory triggering the notification has already been removed from the memory map so can't // be double released during the callback. - for (auto& notifier : *notifiers) { - notifier.callback(notifier.ptr, notifier.user_data); + if (notifiers) { + for (auto& notifier : *notifiers) { + notifier.callback(notifier.ptr, notifier.user_data); + } } - // Fragment allocator requires protection. - ScopedAcquire lock(&memory_lock_); return region->Free(ptr, size); } hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback, void* user_data) { - ScopedAcquire lock(&memory_lock_); + ScopedAcquire lock(&memory_lock_); auto mem = allocation_map_.upper_bound(ptr); if (mem != allocation_map_.begin()) { mem--; @@ -360,7 +356,7 @@ hsa_status_t Runtime::RegisterReleaseNotifier(void* ptr, hsa_amd_deallocation_ca hsa_status_t Runtime::DeregisterReleaseNotifier(void* ptr, hsa_amd_deallocation_callback_t callback) { hsa_status_t ret = HSA_STATUS_ERROR_INVALID_ARGUMENT; - ScopedAcquire lock(&memory_lock_); + ScopedAcquire lock(&memory_lock_); auto mem = allocation_map_.upper_bound(ptr); if (mem != allocation_map_.begin()) { mem--; @@ -560,7 +556,7 @@ hsa_status_t Runtime::AllowAccess(uint32_t num_agents, size_t alloc_size = 0; { - ScopedAcquire lock(&memory_lock_); + ScopedAcquire lock(&memory_lock_); std::map::const_iterator it = allocation_map_.find(ptr); @@ -786,7 +782,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi { // memory_lock protects access to the NMappedNodes array and fragment user data since these may // change with calls to memory APIs. - ScopedAcquire lock(&memory_lock_); + ScopedAcquire lock(&memory_lock_); // We don't care if this returns an error code. // The type will be HSA_EXT_POINTER_TYPE_UNKNOWN if so. 
@@ -861,7 +857,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
   if (returnListData) {
     uint32_t count = 0;
     for (HSAuint32 i = 0; i < thunkInfo.NMappedNodes; i++) {
-      assert(mappedNodes[i] < agents_by_node_.size() &&
+      assert(mappedNodes[i] <= max_node_id() &&
              "PointerInfo: Invalid node ID returned from thunk.");
       count += agents_by_node_[mappedNodes[i]].size();
     }
@@ -886,7 +882,7 @@ hsa_status_t Runtime::PtrInfo(const void* ptr, hsa_amd_pointer_info_t* info, voi
 hsa_status_t Runtime::SetPtrInfoData(const void* ptr, void* userptr) {
   {  // Use allocation map if possible to handle fragments.
-    ScopedAcquire<KernelMutex> lock(&memory_lock_);
+    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
     const auto& it = allocation_map_.find(ptr);
     if (it != allocation_map_.end()) {
       it->second.user_ptr = userptr;
@@ -926,7 +922,7 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han
     // Holds size in (4K?) pages in thunk handle: Mark as a fragment and denote offset.
     handle->handle[6] |= 0x80000000 | offset;
     // Mark block for IPC. Prevents reallocation of exported memory.
-    ScopedAcquire<KernelMutex> lock(&memory_lock_);
+    ScopedAcquire<KernelSharedMutex::Shared> lock(memory_lock_.shared());
     hsa_status_t err = allocation_map_[ptr].region->IPCFragmentExport(ptr);
     assert(err == HSA_STATUS_SUCCESS && "Region inconsistent with address map.");
     return err;
@@ -951,11 +947,12 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
   // Extract fragment info
   bool isFragment = false;
   uint32_t fragOffset = 0;
+
   auto fixFragment = [&]() {
     if (!isFragment) return;
     importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
     len = Min(len, importSize - fragOffset);
-    ScopedAcquire<KernelMutex> lock(&memory_lock_);
+    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
     allocation_map_[importAddress] = AllocationRegion(nullptr, len);
   };
@@ -1017,7 +1014,7 @@ hsa_status_t Runtime::IPCDetach(void* ptr) {
   {  // Handle imported fragments.
-    ScopedAcquire<KernelMutex> lock(&memory_lock_);
+    ScopedAcquire<KernelSharedMutex> lock(&memory_lock_);
     const auto& it = allocation_map_.find(ptr);
     if (it != allocation_map_.end()) {
       if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -1646,7 +1643,7 @@ hsa_status_t Runtime::SetSvmAttrib(void* ptr, size_t size,
                                    hsa_amd_svm_attribute_pair_t* attribute_list,
                                    size_t attribute_count) {
   uint32_t set_attribs = 0;
-  std::vector<bool> agent_seen(agents_by_node_.size(), false);
+  std::vector<bool> agent_seen(max_node_id() + 1, false);
   std::vector attribs;
   attribs.reserve(attribute_count);

diff --git a/src/core/util/flag.cpp b/src/core/util/flag.cpp
index f45ea9992..f772ea982 100644
--- a/src/core/util/flag.cpp
+++ b/src/core/util/flag.cpp
@@ -111,7 +111,7 @@ CU_list = 0x[0-F]* | ID_list ex. 0x337F OR 0,2-4,7
 CU_Set = GPU_list : CU_list ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
 HSA_CU_MASK = CU_Set [; CU_Set]* ex. 0,2-4,7:0-15,32-47; 3-9:0x337F

-GPU indexes are taken post ROCM_VISIBLE_DEVICES reordering.
+GPU indexes are taken post ROCR_VISIBLE_DEVICES reordering.
 Listed or bit set CUs will be enabled at queue creation on the associated GPU.
 All other CUs on the associated GPUs will be disabled.
 CU masks of unlisted GPUs are not restricted.
@@ -120,7 +120,7 @@
 Repeating a GPU or CU ID is a syntax error.
 Parsing stops at the first CU_Set that has a syntax error, that set and all
 following sets are ignored.
 Specifying a mask with no usable CUs (CU_list is 0x0) is a syntax error.
-Users should use ROCM_VISIBLE_DEVICES if they want to exclude use of a
+Users should use ROCR_VISIBLE_DEVICES if they want to exclude use of a
 particular GPU.
 */
 void Flag::parse_masks(std::string& var, uint32_t maxGpu, uint32_t maxCU) {

diff --git a/src/core/util/flag.h b/src/core/util/flag.h
index 4bde01229..045a6d0c1 100644
--- a/src/core/util/flag.h
+++ b/src/core/util/flag.h
@@ -145,6 +145,14 @@ class Flag {
     var = os::GetEnvVar("HSA_ENABLE_DEBUG");
     debug_ = (var == "1") ? true : false;
+
+    var = os::GetEnvVar("HSA_CU_MASK_SKIP_INIT");
+    cu_mask_skip_init_ = (var == "1") ? true : false;
+
+    // Temporary opt-in for corrected HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT behavior.
+    // Will become opt-out and possibly removed in future releases.
+    var = os::GetEnvVar("HSA_COOP_CU_COUNT");
+    coop_cu_count_ = (var == "1") ? true : false;
   }

   void parse_masks(uint32_t maxGpu, uint32_t maxCU) {
@@ -209,6 +217,10 @@ class Flag {
     return it->second;
   }

+  bool cu_mask_skip_init() const { return cu_mask_skip_init_; }
+
+  bool coop_cu_count() const { return coop_cu_count_; }
+
  private:
   bool check_flat_scratch_;
   bool enable_vm_fault_message_;
@@ -227,6 +239,8 @@ class Flag {
   bool loader_enable_mmap_uri_;
   bool check_sramecc_validity_;
   bool debug_;
+  bool cu_mask_skip_init_;
+  bool coop_cu_count_;

   SDMA_OVERRIDE enable_sdma_;

diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp
index 6c2761151..b5d64fb57 100644
--- a/src/core/util/lnx/os_linux.cpp
+++ b/src/core/util/lnx/os_linux.cpp
@@ -168,6 +168,7 @@ class os_thread {

 static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch");
 static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch");
+static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch");
 static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch");

 LibHandle LoadLib(std::string filename) {
@@ -457,6 +458,63 @@ uint64_t AccurateClockFrequency() {
   if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec);
   return 1000000000ull / uint64_t(time.tv_nsec);
 }
+
+SharedMutex CreateSharedMutex() {
+  pthread_rwlockattr_t attrib;
+  int err = pthread_rwlockattr_init(&attrib);
+  if (err != 0) {
+    assert(false && "rw lock attribute init failed.");
+    return nullptr;
+  }
+  err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+  assert(err == 0 && "Set rw lock attribute failure.");
+
+  pthread_rwlock_t* lock = new pthread_rwlock_t;
+  err = pthread_rwlock_init(lock, &attrib);
+  if (err != 0) {
+    assert(false && "rw lock init failed.");
+    return nullptr;
+  }
+
+  pthread_rwlockattr_destroy(&attrib);
+  return lock;
+}
+
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_trywrlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+bool AcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_wrlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+void ReleaseSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock);
+  assert(err == 0 && "SharedMutex unlock failed.");
+}
+
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_tryrdlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_rdlock(*(pthread_rwlock_t**)&lock);
+  return err == 0;
+}
+
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock);
+  assert(err == 0 && "SharedMutex unlock failed.");
+}
+
+void DestroySharedMutex(SharedMutex lock) {
+  pthread_rwlock_destroy(*(pthread_rwlock_t**)&lock);
+  delete *(pthread_rwlock_t**)&lock;
+}
+
 } // namespace os
 } // namespace rocr

diff --git a/src/core/util/locks.h b/src/core/util/locks.h
index 49ad6906f..fd4d4569b 100644
--- a/src/core/util/locks.h
+++ b/src/core/util/locks.h
@@ -50,34 +50,6 @@ namespace rocr {

-/// @brief: A class behaves as a lock in a scope. When trying to enter into the
-/// critical section, creat a object of this class. After the control path goes
-/// out of the scope, it will release the lock automatically.
-template <class LockType>
-class ScopedAcquire {
- public:
-  /// @brief: When constructing, acquire the lock.
-  /// @param: lock(Input), pointer to an existing lock.
-  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) { lock_->Acquire(); }
-
-  /// @brief: when destructing, release the lock.
-  ~ScopedAcquire() {
-    if (doRelease) lock_->Release();
-  }
-
-  /// @brief: Release the lock early. Avoid using when possible.
-  void Release() {
-    lock_->Release();
-    doRelease = false;
-  }
-
- private:
-  LockType* lock_;
-  bool doRelease;
-  /// @brief: Disable copiable and assignable ability.
-  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
-};
-
 /// @brief: a class represents a kernel mutex.
 /// Uses the kernel's scheduler to keep the waiting thread from being scheduled
 /// until the lock is released (Best for long waits, though anything using
@@ -144,6 +116,120 @@ class KernelEvent {
   DISALLOW_COPY_AND_ASSIGN(KernelEvent);
 };

+/// @brief: represents a yielding shared mutex.
+/// aka read/write mutex
+class KernelSharedMutex {
+ public:
+  /// @brief: Interfaces ScopedAcquire to shared operations.
+  class Shared {
+   public:
+    explicit Shared(KernelSharedMutex* lock) : lock_(lock) {}
+    bool Try() { return lock_->TryShared(); }
+    bool Acquire() { return lock_->AcquireShared(); }
+    void Release() { lock_->ReleaseShared(); }
+
+   private:
+    KernelSharedMutex* lock_;
+  };
+
+  KernelSharedMutex() { lock_ = os::CreateSharedMutex(); }
+  ~KernelSharedMutex() { os::DestroySharedMutex(lock_); }
+
+  // Exclusive mode operations
+  bool Try() { return os::TryAcquireSharedMutex(lock_); }
+  bool Acquire() { return os::AcquireSharedMutex(lock_); }
+  void Release() { os::ReleaseSharedMutex(lock_); }
+
+  // Shared mode operations
+  bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); }
+  bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); }
+  void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); }
+
+  // Return shared operations interface
+  Shared shared() { return Shared(this); }
+
+ private:
+  os::SharedMutex lock_;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex);
+};
+
+/// @brief: Type trait to identify mutex types
+template <class T> class isMutex {
+ public:
+  enum { value = false };
+};
+template <> class isMutex<KernelMutex> {
+ public:
+  enum { value = true };
+};
+template <> class isMutex<KernelSharedMutex> {
+ public:
+  enum { value = true };
+};
+template <> class isMutex {
+ public:
+  enum { value = true };
+};
+
+/// @brief: A class behaves as a lock in a scope. When trying to enter into the
+/// critical section, creat a object of this class. After the control path goes
+/// out of the scope, it will release the lock automatically.
+template <class LockType> class ScopedAcquire {
+ public:
+  /// @brief: When constructing, acquire the lock.
+  /// @param: lock(Input), pointer to an existing lock.
+  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
+    static_assert(isMutex<LockType>::value, "ScopedAcquire requires a mutex type.");
+    lock_.Acquire();
+  }
+  explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) {
+    static_assert(!isMutex<LockType>::value, "Mutex types are not copyable.");
+    lock_.Acquire();
+  }
+
+  /// @brief: when destructing, release the lock.
+  ~ScopedAcquire() {
+    if (doRelease) lock_.Release();
+  }
+
+  /// @brief: Release the lock early. Avoid using when possible.
+  void Release() {
+    lock_.Release();
+    doRelease = false;
+  }
+
+ private:
+  /// @brief: Adapts between pointers to mutex types and mutex pointer types.
+  template <class T, bool> class container {
+   public:
+    container(T* lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_->Acquire(); }
+    __forceinline void Release() { return lock_->Release(); }
+
+   private:
+    T* lock_;
+  };
+
+  /// @brief: Specialization for mutex pointer types.
+  template <class T> class container<T, false> {
+   public:
+    container(T lock) : lock_(lock) {}
+    __forceinline bool Acquire() { return lock_.Acquire(); }
+    __forceinline void Release() { return lock_.Release(); }
+
+   private:
+    T lock_;
+  };
+
+  container<LockType, isMutex<LockType>::value> lock_;
+  bool doRelease;
+
+  /// @brief: Disable copiable and assignable ability.
+  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
+};
+
 } // namespace rocr
 #endif  // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_

diff --git a/src/core/util/os.h b/src/core/util/os.h
index ec9f54a42..134ee9a87 100644
--- a/src/core/util/os.h
+++ b/src/core/util/os.h
@@ -52,6 +52,7 @@ namespace rocr {
 namespace os {
 typedef void* LibHandle;
 typedef void* Mutex;
+typedef void* SharedMutex;
 typedef void* Thread;
 typedef void* EventHandle;
@@ -110,6 +111,48 @@ void ReleaseMutex(Mutex lock);
 /// @return: void.
 void DestroyMutex(Mutex lock);

+/// @brief: Creates a shared mutex, will return NULL if failed.
+/// @param: void.
+/// @return: SharedMutex.
+SharedMutex CreateSharedMutex();
+
+/// @brief: Tries to acquire the mutex in exclusive mode once; if successful, returns true.
+/// @param: lock(Input), handle to the shared mutex.
+/// @return: bool.
+bool TryAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in exclusive mode, if the mutex is locked, it will wait until it is
+/// released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool AcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from exclusive mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void ReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Tries to acquire the mutex in shared mode once; if successful, returns true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool TrySharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Acquires the mutex in shared mode, if the mutex is in exclusive mode, it will wait until it
+/// is released. If the mutex is acquired successfully, it will return true.
+/// @param: lock(Input), handle to the mutex.
+/// @return: bool.
+bool SharedAcquireSharedMutex(SharedMutex lock);
+
+/// @brief: Releases the mutex from shared mode.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void SharedReleaseSharedMutex(SharedMutex lock);
+
+/// @brief: Destroys the mutex.
+/// @param: lock(Input), handle to the mutex.
+/// @return: void.
+void DestroySharedMutex(SharedMutex lock);
+
 /// @brief: Puts current thread to sleep.
 /// @param: delayInMs(Input), time in millisecond for sleeping.
 /// @return: void.

diff --git a/src/core/util/utils.h b/src/core/util/utils.h
index 3c95d4545..b1901b652 100644
--- a/src/core/util/utils.h
+++ b/src/core/util/utils.h
@@ -158,6 +158,7 @@ class ScopeGuard {
     dismiss_ = rhs.dismiss_;
     release_ = rhs.release_;
     rhs.dismiss_ = true;
+    return *this;
   }

   __forceinline void Dismiss() { dismiss_ = true; }

diff --git a/src/core/util/win/os_win.cpp b/src/core/util/win/os_win.cpp
index 226b54b4a..cc543d528 100644
--- a/src/core/util/win/os_win.cpp
+++ b/src/core/util/win/os_win.cpp
@@ -224,6 +224,52 @@ uint64_t AccurateClockFrequency() {
   QueryPerformanceFrequency((LARGE_INTEGER*)&ret);
   return ret;
 }
+
+SharedMutex CreateSharedMutex() {
+  assert(false && "Not implemented.");
+  abort();
+  return nullptr;
+}
+
+bool TryAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+bool AcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+void ReleaseSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+bool TrySharedAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+bool SharedAcquireSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+  return false;
+}
+
+void SharedReleaseSharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
+void DestroySharedMutex(SharedMutex lock) {
+  assert(false && "Not implemented.");
+  abort();
+}
+
 } // namespace os
 } // namespace rocr

diff --git a/src/image/blit_kernel.cpp b/src/image/blit_kernel.cpp
index 95f2b51c2..4737b7cb6 100644
--- a/src/image/blit_kernel.cpp
+++ b/src/image/blit_kernel.cpp
@@ -88,6 +88,7 @@ extern uint8_t ocl_blit_object_gfx90c[];
 extern uint8_t ocl_blit_object_gfx1010[];
 extern uint8_t ocl_blit_object_gfx1011[];
 extern uint8_t ocl_blit_object_gfx1012[];
+extern uint8_t ocl_blit_object_gfx1013[];
 extern uint8_t ocl_blit_object_gfx1030[];
 extern uint8_t ocl_blit_object_gfx1031[];
 extern uint8_t ocl_blit_object_gfx1032[];
@@ -1007,6 +1008,8 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name,
     *blit_code_object = ocl_blit_object_gfx1011;
   } else if (sname == "gfx1012") {
     *blit_code_object = ocl_blit_object_gfx1012;
+  } else if (sname == "gfx1013") {
+    *blit_code_object = ocl_blit_object_gfx1013;
   } else if (sname == "gfx1030") {
     *blit_code_object = ocl_blit_object_gfx1030;
   } else if (sname == "gfx1031") {

diff --git a/src/image/blit_src/CMakeLists.txt b/src/image/blit_src/CMakeLists.txt
index 3f334c8e3..068995282 100644
--- a/src/image/blit_src/CMakeLists.txt
+++ b/src/image/blit_src/CMakeLists.txt
@@ -74,7 +74,7 @@ endif()

 # Determine the target devices if not specified
 if (NOT DEFINED TARGET_DEVICES)
-  set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035")
+  set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035")
 endif()
 set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE )

diff --git a/src/image/util.h b/src/image/util.h
index 5eb9f3aab..8c0faaad8 100644
--- a/src/image/util.h
+++ b/src/image/util.h
@@ -130,6 +130,7 @@ template class ScopeGuard {
     dismiss_ = rhs.dismiss_;
     release_ = rhs.release_;
     rhs.dismiss_ = true;
+    return *this;
   }

   __forceinline void Dismiss() { dismiss_ = true; }

diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h
index 227773c9a..ec5229e02 100644
--- a/src/inc/amd_hsa_elf.h
+++ b/src/inc/amd_hsa_elf.h
@@ -120,10 +120,11 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040,
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X41 = 0x041,
+  EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,

   // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1013,

   // Indicates if the "xnack" target feature is enabled for all code contained
   // in the object.

diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h
index bb568bdc9..6c1a936e5 100644
--- a/src/inc/hsa_ext_amd.h
+++ b/src/inc/hsa_ext_amd.h
@@ -311,7 +311,13 @@ typedef enum hsa_amd_agent_info_s {
    * physically resident in the agent's local memory.
    * The type of this attribute is bool.
    */
-  HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013
+  HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013,
+  /**
+   * Some processors support more CUs than can reliably be used in a cooperative
+   * dispatch. This queries the count of CUs which are fully enabled for
+   * cooperative dispatch.
+   */
+  HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014
 } hsa_amd_agent_info_t;

 typedef struct hsa_amd_hdp_flush_s {

diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp
index 7ca8be13f..2a3391b2a 100644
--- a/src/libamdhsacode/amd_hsa_code.cpp
+++ b/src/libamdhsacode/amd_hsa_code.cpp
@@ -573,6 +573,7 @@ namespace code {
       case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true; sramecc_supported = false; break;
       case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true; sramecc_supported = false; break;
       case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: name = "gfx1012"; xnack_supported = true; sramecc_supported = false; break;
+      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1013: name = "gfx1013"; xnack_supported = true; sramecc_supported = false; break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: name = "gfx1030"; xnack_supported = false; sramecc_supported = false; break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031: name = "gfx1031"; xnack_supported = false; sramecc_supported = false; break;
      case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032: name = "gfx1032"; xnack_supported = false; sramecc_supported = false; break;
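
Reviewer note: a minimal usage sketch (not part of the patch) of the new HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT attribute introduced above. It assumes a standard ROCm install with hsa/hsa.h and hsa/hsa_ext_amd.h on the include path; per the Flag changes in this patch, the corrected gfx90a count is opt-in via the HSA_COOP_CU_COUNT=1 environment variable, otherwise the query falls back to the plain CU count.

// Hypothetical example, not shipped with the runtime.
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cstdio>

static hsa_status_t print_coop_cus(hsa_agent_t agent, void*) {
  hsa_device_type_t type;
  if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) != HSA_STATUS_SUCCESS ||
      type != HSA_DEVICE_TYPE_GPU)
    return HSA_STATUS_SUCCESS;  // Skip non-GPU agents.
  uint32_t cus = 0;
  // Cast mirrors how the runtime itself forwards AMD-specific attributes.
  hsa_status_t err = hsa_agent_get_info(
      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT, &cus);
  if (err == HSA_STATUS_SUCCESS) printf("cooperative CUs: %u\n", cus);
  return HSA_STATUS_SUCCESS;
}

int main() {
  if (hsa_init() != HSA_STATUS_SUCCESS) return 1;
  hsa_iterate_agents(print_coop_cus, nullptr);
  hsa_shut_down();
  return 0;
}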