diff --git a/sample/vector_copy.c b/sample/vector_copy.c
index 44e1795f0..d1de531bb 100644
--- a/sample/vector_copy.c
+++ b/sample/vector_copy.c
@@ -49,7 +49,7 @@
 #define check(msg, status) \
 if (status != HSA_STATUS_SUCCESS) { \
-    printf("%s failed: %x.\n", #msg, status); \
+    printf("%s failed.\n", #msg); \
     exit(1); \
 } else { \
    printf("%s succeeded.\n", #msg); \
 }
@@ -230,11 +230,10 @@ int main(int argc, char **argv) {
     */
     hsa_ext_module_t module;
     if(HSA_PROFILE_FULL == profile) {
-        err = load_module_from_file("vector_copy_full.brig",&module);
+        load_module_from_file("vector_copy_full.brig",&module);
     } else {
-        err = load_module_from_file("vector_copy_base.brig",&module);
+        load_module_from_file("vector_copy_base.brig",&module);
     }
-    check(Load module from file, err);
 
     /*
      * Create hsa program.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7c9ef0c84..d9a98c972 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -192,7 +192,7 @@ set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pos
 
 ## RPM package specific variables
 set ( CPACK_RPM_PACKAGE_DEPENDS "hsakmt-roct-dev" )
-set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
+set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
 set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" )
 
 ## Include packaging
diff --git a/src/cmake_modules/hsa_common.cmake b/src/cmake_modules/hsa_common.cmake
index 26971c306..0b907bf3d 100644
--- a/src/cmake_modules/hsa_common.cmake
+++ b/src/cmake_modules/hsa_common.cmake
@@ -55,7 +55,7 @@ if(UNIX)
     set(PS ":")
     set(CMAKE_CXX_FLAGS "-Wall -std=c++11 ${EXTRA_CFLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-undefined")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--unresolved-symbols=ignore-in-shared-libs")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-strict-aliasing")
     if ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" )
         set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" )
diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h
index d85d9443a..d2321dfd4 100644
--- a/src/core/inc/amd_memory_region.h
+++ b/src/core/inc/amd_memory_region.h
@@ -86,13 +86,11 @@ class MemoryRegion : public core::MemoryRegion {
   static void DeregisterMemory(void* ptr);
 
   /// @brief Pin memory.
-  static bool MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes,
-                                    void* ptr, size_t size,
-                                    uint64_t* alternate_va,
-                                    HsaMemMapFlags map_flag);
+  static bool MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, const void* ptr,
+                                    size_t size, uint64_t* alternate_va, HsaMemMapFlags map_flag);
 
   /// @brief Unpin memory.
-  static void MakeKfdMemoryUnresident(void* ptr);
+  static void MakeKfdMemoryUnresident(const void* ptr);
 
   MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
                const HsaMemoryProperties& mem_props);
diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h
index c8d00bcb6..06eff531a 100644
--- a/src/core/inc/runtime.h
+++ b/src/core/inc/runtime.h
@@ -315,12 +315,11 @@ class Runtime {
   static void AsyncEventsLoop(void*);
 
   struct AllocationRegion {
-    AllocationRegion() : region(NULL), assigned_agent_(NULL), size(0) {}
+    AllocationRegion() : region(NULL), size(0) {}
     AllocationRegion(const MemoryRegion* region_arg, size_t size_arg)
-        : region(region_arg), assigned_agent_(NULL), size(size_arg) {}
+        : region(region_arg), size(size_arg) {}
 
     const MemoryRegion* region;
-    const Agent* assigned_agent_;
     size_t size;
   };
 
diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
index 08ea936a5..1fa48a94d 100644
--- a/src/core/runtime/amd_gpu_agent.cpp
+++ b/src/core/runtime/amd_gpu_agent.cpp
@@ -566,7 +566,7 @@ void GpuAgent::InitDma() {
   if (!blit_initialized_.load(std::memory_order_relaxed)) {
     // Try create SDMA blit first.
     // TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified.
-    if ((isa_->GetMajorVersion() != 9) &&
+    if ((isa_->GetMajorVersion() != 8) &&
         core::Runtime::runtime_singleton_->flag().enable_sdma() &&
         (HSA_PROFILE_BASE == profile_)) {
       blits_[BlitHostToDev] = CreateBlitSdma();
diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp
index 295be378d..00642f3de 100644
--- a/src/core/runtime/amd_memory_region.cpp
+++ b/src/core/runtime/amd_memory_region.cpp
@@ -81,23 +81,21 @@ bool MemoryRegion::RegisterMemory(void* ptr, size_t size, size_t num_nodes,
 
 void MemoryRegion::DeregisterMemory(void* ptr) { hsaKmtDeregisterMemory(ptr); }
 
-bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes,
-                                         void* ptr, size_t size,
-                                         uint64_t* alternate_va,
+bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, const void* ptr,
+                                         size_t size, uint64_t* alternate_va,
                                          HsaMemMapFlags map_flag) {
   assert(num_node > 0);
   assert(nodes != NULL);
 
   *alternate_va = 0;
-  const HSAKMT_STATUS status =
-      hsaKmtMapMemoryToGPUNodes(ptr, size, alternate_va, map_flag, num_node,
-                                const_cast<uint32_t*>(nodes));
+  const HSAKMT_STATUS status = hsaKmtMapMemoryToGPUNodes(
+      const_cast<void*>(ptr), size, alternate_va, map_flag, num_node, const_cast<uint32_t*>(nodes));
 
   return (status == HSAKMT_STATUS_SUCCESS);
 }
 
-void MemoryRegion::MakeKfdMemoryUnresident(void* ptr) {
-  hsaKmtUnmapMemoryToGPU(ptr);
+void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
+  hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
 }
 
 MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,
@@ -454,15 +452,16 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
   if (whitelist_nodes.size() == 0 && IsSystem()) {
     assert(cpu_in_list);
     // This is a system region and only CPU agents in the whitelist.
-    // No need to call map.
+    // Remove old mappings.
+    amd::MemoryRegion::MakeKfdMemoryUnresident(ptr);
     return HSA_STATUS_SUCCESS;
   }
 
   // If this is a local memory region, the owning gpu always needs to be in
   // the whitelist.
   if (IsPublic() &&
-      std::find(whitelist_nodes.begin(), whitelist_nodes.end(),
-                owner()->node_id()) == whitelist_nodes.end()) {
+      std::find(whitelist_nodes.begin(), whitelist_nodes.end(), owner()->node_id()) ==
+          whitelist_nodes.end()) {
     whitelist_nodes.push_back(owner()->node_id());
     whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(owner()));
   }
@@ -470,12 +469,11 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
 
   HsaMemMapFlags map_flag = map_flag_;
   map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0;
 
-  {
   ScopedAcquire<KernelMutex> lock(&core::Runtime::runtime_singleton_->memory_lock_);
   uint64_t alternate_va = 0;
   if (!amd::MemoryRegion::MakeKfdMemoryResident(
-          whitelist_nodes.size(), &whitelist_nodes[0], const_cast<void*>(ptr),
+          whitelist_nodes.size(), &whitelist_nodes[0], ptr,
           size, &alternate_va, map_flag)) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp
index 363d6c20d..a4a15fa03 100644
--- a/src/core/runtime/isa.cpp
+++ b/src/core/runtime/isa.cpp
@@ -196,10 +196,14 @@ const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() {
 
   ISAREG_ENTRY_GEN(7, 0, 0)
   ISAREG_ENTRY_GEN(7, 0, 1)
+  ISAREG_ENTRY_GEN(7, 0, 2)
   ISAREG_ENTRY_GEN(8, 0, 1)
   ISAREG_ENTRY_GEN(8, 0, 2)
   ISAREG_ENTRY_GEN(8, 0, 3)
   ISAREG_ENTRY_GEN(9, 0, 0)
+  ISAREG_ENTRY_GEN(9, 0, 1)
+  ISAREG_ENTRY_GEN(9, 0, 2)
+  ISAREG_ENTRY_GEN(9, 0, 3)
 
   return supported_isas;
 }
diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp
index f2cf2d55c..442fe82f5 100644
--- a/src/core/runtime/runtime.cpp
+++ b/src/core/runtime/runtime.cpp
@@ -453,8 +453,42 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
 }
 
 hsa_status_t Runtime::FillMemory(void* ptr, uint32_t value, size_t count) {
-  assert(blit_agent_ != NULL);
-  return blit_agent_->DmaFill(ptr, value, count);
+  // Choose blit agent from pointer info
+  hsa_amd_pointer_info_t info;
+  uint32_t agent_count;
+  hsa_agent_t* accessible = nullptr;
+  info.size = sizeof(info);
+  MAKE_SCOPE_GUARD([&]() { free(accessible); });
+  hsa_status_t err = PtrInfo(ptr, &info, malloc, &agent_count, &accessible);
+  if (err != HSA_STATUS_SUCCESS) return err;
+
+  ptrdiff_t endPtr = (ptrdiff_t)ptr + count * sizeof(uint32_t);
+
+  // Check for GPU fill
+  // Selects GPU fill for SVM and Locked allocations if a GPU address is given and is mapped.
+  if (info.agentBaseAddress <= ptr &&
+      endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
+    core::Agent* blit_agent = core::Agent::Convert(info.agentOwner);
+    if (blit_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) {
+      blit_agent = nullptr;
+      for (int i = 0; i < agent_count; i++) {
+        if (core::Agent::Convert(accessible[i])->device_type() ==
+            core::Agent::DeviceType::kAmdGpuDevice) {
+          blit_agent = core::Agent::Convert(accessible[i]);
+          break;
+        }
+      }
+    }
+    if (blit_agent) return blit_agent->DmaFill(ptr, value, count);
+  }
+
+  // Host and unmapped SVM addresses copy via host.
+  if (info.hostBaseAddress <= ptr && endPtr <= (ptrdiff_t)info.hostBaseAddress + info.sizeInBytes) {
+    memset(ptr, value, count * sizeof(uint32_t));
+    return HSA_STATUS_SUCCESS;
+  }
+
+  return HSA_STATUS_ERROR_INVALID_ALLOCATION;
 }
 
 hsa_status_t Runtime::AllowAccess(uint32_t num_agents,
@@ -646,6 +680,8 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
   HsaPointerInfo thunkInfo;
   uint32_t* mappedNodes;
 
+  hsa_amd_pointer_info_t retInfo;
+
   // check output struct is at least as large as the first info revision.
   if (info->size < sizeof(struct hsa_amd_pointer_info_v1_s))
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -674,12 +710,15 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
   static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS,
                 "Thunk pointer info mismatch");
 
-  info->size = Min(info->size, sizeof(struct hsa_amd_pointer_info_v1_s));
-  info->type = (hsa_amd_pointer_type_t)thunkInfo.Type;
-  info->agentBaseAddress = (void*)thunkInfo.GPUAddress;
-  info->hostBaseAddress = thunkInfo.CPUAddress;
-  info->sizeInBytes = thunkInfo.SizeInBytes;
-  info->userData = thunkInfo.UserData;
+  retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));
+  retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
+  retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
+  retInfo.hostBaseAddress = thunkInfo.CPUAddress;
+  retInfo.sizeInBytes = thunkInfo.SizeInBytes;
+  retInfo.userData = thunkInfo.UserData;
+  retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
+
+  memcpy(info, &retInfo, retInfo.size);
 
   if (returnListData) {
     uint32_t count = 0;
diff --git a/src/core/util/flag.h b/src/core/util/flag.h
index d7add470c..ce256e6bd 100644
--- a/src/core/util/flag.h
+++ b/src/core/util/flag.h
@@ -69,18 +69,9 @@ class Flag {
 
     var = os::GetEnvVar("HSA_ENABLE_INTERRUPT");
     enable_interrupt_ = (var == "0") ? false : true;
 
-    var = os::GetEnvVar("HSA_ENABLE_THREAD_TRACE");
-    enable_thread_trace_ = (var == "1") ? true : false;
-
-    var = os::GetEnvVar("HSA_THREAD_TRACE_MEM_SIZE");
-    thread_trace_buff_size_ = atoi(var.c_str());
-
     var = os::GetEnvVar("HSA_ENABLE_SDMA");
     enable_sdma_ = (var == "0") ? false : true;
 
-    var = os::GetEnvVar("HSA_EMULATE_AQL");
-    emulate_aql_ = (var == "1") ? true : false;
-
     var = os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND");
     running_valgrind_ = (var == "1") ? true : false;
@@ -104,14 +95,8 @@ class Flag {
 
   bool enable_interrupt() const { return enable_interrupt_; }
 
-  bool enable_thread_trace() const { return enable_thread_trace_; }
-
-  bool thread_trace_buff_size() const { return thread_trace_buff_size_; }
-
   bool enable_sdma() const { return enable_sdma_; }
 
-  bool emulate_aql() const { return emulate_aql_; }
-
   bool running_valgrind() const { return running_valgrind_; }
 
   bool sdma_wait_idle() const { return sdma_wait_idle_; }
@@ -127,14 +112,10 @@ class Flag {
   bool enable_vm_fault_message_;
   bool enable_interrupt_;
   bool enable_sdma_;
-  bool emulate_aql_;
   bool running_valgrind_;
   bool sdma_wait_idle_;
   bool enable_queue_fault_message_;
-  bool enable_thread_trace_;
-  size_t thread_trace_buff_size_;
-
 
   uint32_t max_queues_;
 
   size_t scratch_mem_size_;
diff --git a/src/core/util/timer.h b/src/core/util/timer.h
index 65b9dfb6b..914bda34e 100644
--- a/src/core/util/timer.h
+++ b/src/core/util/timer.h
@@ -147,8 +147,7 @@ class fast_clock {
 #ifdef __x86_64__
   static __forceinline raw_rep raw_now() { return __rdtsc(); }
   static __forceinline raw_frequency raw_freq() { return freq; }
-#endif
-#ifdef __aarch64__
+#else
   static __forceinline raw_rep raw_now() {
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
diff --git a/src/core/util/utils.h b/src/core/util/utils.h
index b3e60a13e..536d81632 100644
--- a/src/core/util/utils.h
+++ b/src/core/util/utils.h
@@ -56,11 +56,6 @@ typedef uint64_t uint64;
 #if defined(__GNUC__)
 #if defined(__i386__) || defined(__x86_64__)
 #include <x86intrin.h>
-#elif defined(__aarch64__)
-#else
-#error \
-    "Processor or compiler not identified. " \
-    "Need to provide a lightweight approximate clock interface via function uint64_t __rdtsc() or adapt timer.h to your platform."
 #endif
 
 #define __forceinline __inline__ __attribute__((always_inline))
diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h
index e184ff15c..85bddcc51 100755
--- a/src/inc/hsa_ext_amd.h
+++ b/src/inc/hsa_ext_amd.h
@@ -1164,14 +1164,17 @@ hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
  *
  * @param[in] count Number of uint32_t element to be set to the value.
  *
- * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * @retval HSA_STATUS_SUCCESS The function has been executed successfully.
  *
- * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
  * initialized.
  *
- * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
  * not 4 bytes aligned
  *
+ * @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory
+ * region was not allocated with HSA runtime APIs.
+ *
  */
 hsa_status_t HSA_API
     hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
@@ -1346,6 +1349,23 @@ typedef struct hsa_amd_pointer_info_v1_s {
   Application provided value.
   */
   void* userData;
+} hsa_amd_pointer_info_v1_t;
+
+/**
+ * @brief Minor version updates to pointer info.
+ */
+#ifdef __cplusplus
+typedef struct hsa_amd_pointer_info_v2_s : hsa_amd_pointer_info_v1_t {
+#else
+typedef struct hsa_amd_pointer_info_v2_t {
+  struct hsa_amd_pointer_info_v1_t;
+#endif
+  /*
+  Reports an agent which "owns" (ie has preferred access to) the pool in which the allocation was
+  made. When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die
+  GPU boards) any such agent may be returned.
+  */
+  hsa_agent_t agentOwner;
 } hsa_amd_pointer_info_t;
 
 /**
diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp
index 5780e3b99..5579a54f0 100644
--- a/src/libamdhsacode/amd_hsa_code.cpp
+++ b/src/libamdhsacode/amd_hsa_code.cpp
@@ -1505,15 +1505,11 @@ namespace code {
 
     bool AmdHsaCode::PullElfV2()
     {
-      Segment* note = NULL;
       for (size_t i = 0; i < img->segmentCount(); ++i) {
         Segment* s = img->segment(i);
         if (s->type() == PT_LOAD) {
           dataSegments.push_back(s);
         }
-        else if (s->type() == PT_NOTE && s->align() >= 4) {
-          note = s;
-        }
       }
       for (size_t i = 0; i < img->sectionCount(); ++i) {
         Section* sec = img->section(i);