From 36f9c49c3922634ae045340f2c3c7452a7198e62 Mon Sep 17 00:00:00 2001
From: James Edwards
Date: Wed, 9 May 2018 12:57:52 -0500
Subject: [PATCH] ROCm 1.8.0 updates

---
 src/core/inc/amd_aql_queue.h | 163 +-------------
 src/core/inc/amd_blit_kernel.h | 12 +-
 src/core/inc/amd_blit_sdma.h | 59 +++--
 src/core/inc/amd_elf_image.hpp | 1 +
 src/core/inc/amd_gpu_agent.h | 56 +++--
 src/core/inc/amd_hsa_code.hpp | 1 +
 src/core/inc/blit.h | 11 +-
 src/core/inc/intercept_queue.h | 148 +------------
 src/core/inc/isa.h | 35 ++-
 src/core/inc/queue.h | 6 +-
 src/core/inc/runtime.h | 2 +-
 src/core/inc/signal.h | 172 +++++++++++++++
 src/core/runtime/amd_aql_queue.cpp | 261 +++++++++++++---------
 src/core/runtime/amd_blit_kernel.cpp | 57 +----
 src/core/runtime/amd_blit_sdma.cpp | 156 ++++++++++++--
 src/core/runtime/amd_gpu_agent.cpp | 262 ++++++++++++++--------
 src/core/runtime/amd_memory_region.cpp | 23 +-
 src/core/runtime/hsa.cpp | 101 ++++++++-
 src/core/runtime/hsa_ext_amd.cpp | 10 +
 src/core/runtime/intercept_queue.cpp | 2 +-
 src/core/runtime/isa.cpp | 48 +++--
 src/core/runtime/runtime.cpp | 76 ++++++-
 src/core/util/flag.h | 16 +-
 src/core/util/lazy_ptr.h | 125 +++++++++++
 src/core/util/small_heap.cpp | 185 ++++++++--------
 src/core/util/small_heap.h | 81 ++++---
 src/inc/amd_hsa_common.h | 8 +-
 src/inc/amd_hsa_elf.h | 5 +
 src/inc/amd_hsa_queue.h | 3 +-
 src/inc/hsa.h | 287 +++++++++++++------------
 src/inc/hsa_ext_amd.h | 11 +-
 src/libamdhsacode/amd_elf_image.cpp | 1 +
 src/loader/executable.cpp | 94 +++++++-
 src/loader/loaders.cpp | 26 +--
 src/loader/loaders.hpp | 6 +-
 35 files changed, 1517 insertions(+), 993 deletions(-)
 create mode 100644 src/core/util/lazy_ptr.h
 mode change 100755 => 100644 src/inc/hsa_ext_amd.h
 mode change 100755 => 100644 src/loader/loaders.cpp

diff --git a/src/core/inc/amd_aql_queue.h b/src/core/inc/amd_aql_queue.h
index 104a0272c..795928bfb 100644
--- a/src/core/inc/amd_aql_queue.h
+++ b/src/core/inc/amd_aql_queue.h
@@ -53,7 +53,7 @@ namespace amd {
 /// @brief Encapsulates HW Aql Command Processor functionality. It
 /// provide the interface for things such as Doorbell register, read,
 /// write pointers and a buffer.
-class AqlQueue : public core::Queue, private core::LocalSignal, public core::Signal {
+class AqlQueue : public core::Queue, private core::LocalSignal, public core::DoorbellSignal {
  public:
   static __forceinline bool IsType(core::Signal* signal) {
     return signal->IsType(&rtti_id_);
@@ -183,164 +183,15 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Sig
   // @brief Submits a block of PM4 and waits until it has been executed.
void ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) override; - /// @brief This operation is illegal - hsa_signal_value_t LoadRelaxed() override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t LoadAcquire() override { - assert(false); - return 0; - } - /// @brief Update signal value using Relaxed semantics void StoreRelaxed(hsa_signal_value_t value) override; /// @brief Update signal value using Release semantics void StoreRelease(hsa_signal_value_t value) override; - /// @brief This operation is illegal - hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - void AndRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AndAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AndRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AndAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void OrAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void XorAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void AddAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubRelaxed(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubAcquire(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubRelease(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - void SubAcqRel(hsa_signal_value_t value) override { assert(false); } - - /// @brief This operation is illegal - hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This 
operation is illegal - hsa_signal_value_t ExchRelease(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelease(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, hsa_signal_value_t value) override { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t* ValueLocation() const override { - assert(false); - return NULL; - } - - /// @brief This operation is illegal - HsaEvent* EopEvent() override { - assert(false); - return NULL; - } - protected: bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; } - /// @brief Disallow destroying doorbell apart from its queue. - void doDestroySignal() override { assert(false); } - private: uint32_t ComputeRingBufferMinPkts(); uint32_t ComputeRingBufferMaxPkts(); @@ -353,12 +204,16 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Sig void CloseRingBufferFD(const char* ring_buf_shm_path, int fd) const; int CreateRingBufferFD(const char* ring_buf_shm_path, uint32_t ring_buf_phys_size_bytes) const; - static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); - /// @brief Define the Scratch Buffer Descriptor and related parameters /// that enable kernel access scratch memory void InitScratchSRD(); + /// @brief Halt the queue without destroying it or fencing memory. + void Suspend(); + + /// @brief Handler for hardware queue events. + static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); + // AQL packet ring buffer void* ring_buf_; @@ -395,6 +250,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Sig uint32_t pm4_ib_size_b_; KernelMutex pm4_ib_mutex_; + // Error handler control variable. + std::atomic dynamicScratchState; + enum { ERROR_HANDLER_DONE = 1, ERROR_HANDLER_TERMINATE = 2, ERROR_HANDLER_SCRATCH_RETRY = 4 }; + // Shared event used for queue errors static HsaEvent* queue_event_; diff --git a/src/core/inc/amd_blit_kernel.h b/src/core/inc/amd_blit_kernel.h index 9d7090940..6ebc43640 100644 --- a/src/core/inc/amd_blit_kernel.h +++ b/src/core/inc/amd_blit_kernel.h @@ -76,24 +76,26 @@ class BlitKernel : public core::Blit { /// @brief Submit an AQL packet to perform vector copy. The call is blocking /// until the command execution is finished. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. - virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) override; + virtual hsa_status_t SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) override; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is non blocking. 
The memory transfer will start /// after all dependent signals are satisfied. After the transfer is /// completed, the out signal will be decremented. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. /// @param dep_signals Arrays of dependent signal. /// @param out_signal Output signal. virtual hsa_status_t SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) override; @@ -152,10 +154,6 @@ class BlitKernel : public core::Blit { /// packet processor doesn't get invalid packet. void ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet); - /// Wait until all packets are finished. - hsa_status_t FenceRelease(uint64_t write_index, uint32_t num_copy_packet, - hsa_fence_scope_t fence); - void PopulateQueue(uint64_t index, uint64_t code_handle, void* args, uint32_t grid_size_x, hsa_signal_t completion_signal); diff --git a/src/core/inc/amd_blit_sdma.h b/src/core/inc/amd_blit_sdma.h index fb0eb1abf..95fed4da4 100644 --- a/src/core/inc/amd_blit_sdma.h +++ b/src/core/inc/amd_blit_sdma.h @@ -61,6 +61,7 @@ class BlitSdmaBase : public core::Blit { static const size_t kCopyPacketSize; static const size_t kMaxSingleCopySize; static const size_t kMaxSingleFillSize; + virtual bool isSDMA() const override { return true; } }; // RingIndexTy: 32/64-bit monotonic ring index, counting in bytes. @@ -69,7 +70,7 @@ class BlitSdmaBase : public core::Blit { template class BlitSdma : public BlitSdmaBase { public: - explicit BlitSdma(); + explicit BlitSdma(bool copy_direction); virtual ~BlitSdma() override; @@ -94,24 +95,26 @@ class BlitSdma : public BlitSdmaBase { /// @brief Submit a linear copy command to the queue buffer. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. - virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) override; + virtual hsa_status_t SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) override; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is non blocking. The memory transfer will start /// after all dependent signals are satisfied. After the transfer is /// completed, the out signal will be decremented. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. /// @param dep_signals Arrays of dependent signal. /// @param out_signal Output signal. virtual hsa_status_t SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) override; @@ -125,7 +128,7 @@ class BlitSdma : public BlitSdmaBase { virtual hsa_status_t EnableProfiling(bool enable) override; - protected: + private: /// @brief Acquires the address into queue buffer where a new command /// packet of specified size could be written. 
The address that is /// returned is guaranteed to be unique even in a multi-threaded access @@ -170,6 +173,9 @@ class BlitSdma : public BlitSdmaBase { void BuildFenceCommand(char* fence_command_addr, uint32_t* fence, uint32_t fence_value); + /// @brief Build Hdp Flush command + void BuildHdpFlushCommand(char* cmd_addr); + uint32_t* ObtainFenceObject(); void WaitFence(uint32_t* fence, uint32_t fence_value); @@ -204,19 +210,25 @@ class BlitSdma : public BlitSdmaBase { RingIndexTy cached_reserve_index_; RingIndexTy cached_commit_index_; - uint32_t linear_copy_command_size_; + static const uint32_t linear_copy_command_size_; + + static const uint32_t fill_command_size_; + + static const uint32_t fence_command_size_; - uint32_t fill_command_size_; + static const uint32_t poll_command_size_; - uint32_t fence_command_size_; + static const uint32_t flush_command_size_; - uint32_t poll_command_size_; + static const uint32_t atomic_command_size_; - uint32_t atomic_command_size_; + static const uint32_t timestamp_command_size_; - uint32_t timestamp_command_size_; + static const uint32_t trap_command_size_; - uint32_t trap_command_size_; + // Flag to indicate if sDMA queue is used for H2D copy operations + // true if used for H2D operations, false otherwise + const bool sdma_h2d_; // Max copy size of a single linear copy command packet. size_t max_single_linear_copy_size_; @@ -232,19 +244,20 @@ class BlitSdma : public BlitSdmaBase { /// True if platform atomic is supported. bool platform_atomic_support_; + + /// True if sDMA supports HDP flush + bool hdp_flush_support_; }; -class BlitSdmaV2V3 - // Ring indices are 32-bit. - // HW ring indices are not monotonic (wrap at end of ring). - // Count fields of SDMA commands are 0-based. - : public BlitSdma {}; - -class BlitSdmaV4 - // Ring indices are 64-bit. - // HW ring indices are monotonic (do not wrap at end of ring). - // Count fields of SDMA commands are 1-based. - : public BlitSdma {}; +// Ring indices are 32-bit. +// HW ring indices are not monotonic (wrap at end of ring). +// Count fields of SDMA commands are 0-based. +typedef BlitSdma BlitSdmaV2V3; + +// Ring indices are 64-bit. +// HW ring indices are monotonic (do not wrap at end of ring). +// Count fields of SDMA commands are 1-based. +typedef BlitSdma BlitSdmaV4; } // namespace amd diff --git a/src/core/inc/amd_elf_image.hpp b/src/core/inc/amd_elf_image.hpp index 763c5c831..6667c3b36 100644 --- a/src/core/inc/amd_elf_image.hpp +++ b/src/core/inc/amd_elf_image.hpp @@ -207,6 +207,7 @@ namespace amd { virtual uint16_t Machine() = 0; virtual uint16_t Type() = 0; + virtual uint32_t EFlags() = 0; std::string output() { return out.str(); } diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index 58a70a2cb..ff8fb193e 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -56,6 +56,7 @@ #include "core/inc/cache.h" #include "core/util/small_heap.h" #include "core/util/locks.h" +#include "core/util/lazy_ptr.h" namespace amd { class MemoryRegion; @@ -66,6 +67,8 @@ struct ScratchInfo { size_t size; size_t size_per_thread; ptrdiff_t queue_process_offset; + bool large; + bool retry; }; // @brief Interface to represent a GPU agent. @@ -75,10 +78,8 @@ class GpuAgentInt : public core::Agent { GpuAgentInt(uint32_t node_id) : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} - // @brief Initialize DMA queue. - // - // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. 
- virtual void InitDma() = 0; + // @brief Ensure blits are ready (performance hint). + virtual void PreloadBlits() {} // @brief Initialization hook invoked after tools library has loaded, // to allow tools interception of interface functions. @@ -104,15 +105,15 @@ class GpuAgentInt : public core::Agent { // @brief Carve scratch memory from scratch pool. // - // @param [out] scratch Structure to be populated with the carved memory + // @param [in/out] scratch Structure to be populated with the carved memory // information. virtual void AcquireQueueScratch(ScratchInfo& scratch) = 0; // @brief Release scratch memory back to scratch pool. // - // @param [in] base Address of scratch memory previously acquired with - // call to ::AcquireQueueScratch. - virtual void ReleaseQueueScratch(void* base) = 0; + // @param [in/out] scratch Scratch memory previously acquired with call to + // ::AcquireQueueScratch. + virtual void ReleaseQueueScratch(ScratchInfo& base) = 0; // @brief Translate the kernel start and end dispatch timestamp from agent // domain to host domain. @@ -185,14 +186,16 @@ class GpuAgent : public GpuAgentInt { // @brief GPU agent destructor. ~GpuAgent(); - // @brief Override from core::Agent. - void InitDma() override; + // @brief Ensure blits are ready (performance hint). + void PreloadBlits() override; // @brief Override from core::Agent. hsa_status_t PostToolsInit() override; uint16_t GetMicrocodeVersion() const; + uint16_t GetSdmaMicrocodeVersion() const; + // @brief Assembles SP3 shader source into ISA or AQL code object. // // @param [in] src_sp3 SP3 shader source text representation. @@ -256,7 +259,20 @@ class GpuAgent : public GpuAgentInt { void AcquireQueueScratch(ScratchInfo& scratch) override; // @brief Override from amd::GpuAgentInt. - void ReleaseQueueScratch(void* base) override; + void ReleaseQueueScratch(ScratchInfo& scratch) override; + + // @brief Register signal for notification when scratch may become available. + // @p signal is notified by OR'ing with @p value. + void AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) { + ScopedAcquire lock(&scratch_lock_); + scratch_notifiers_[signal] = value; + } + + // @brief Deregister scratch notification signal. + void RemoveScratchNotifier(hsa_signal_t signal) { + ScopedAcquire lock(&scratch_lock_); + scratch_notifiers_.erase(signal); + } // @brief Override from amd::GpuAgentInt. void TranslateTime(core::Signal* signal, @@ -326,7 +342,7 @@ class GpuAgent : public GpuAgentInt { // @brief Create SDMA blit object. // // @retval NULL if SDMA blit creation and initialization failed. - core::Blit* CreateBlitSdma(); + core::Blit* CreateBlitSdma(bool h2d); // @brief Create Kernel blit object using provided compute queue. // @@ -367,6 +383,12 @@ class GpuAgent : public GpuAgentInt { // @brief Object to manage scratch memory. SmallHeap scratch_pool_; + // @brief Current short duration scratch memory size. + size_t scratch_used_large_; + + // @brief Notifications for scratch release. + std::map scratch_notifiers_; + // @brief Default scratch size per queue. size_t queue_scratch_len_; @@ -376,7 +398,7 @@ class GpuAgent : public GpuAgentInt { // @brief Blit interfaces for each data path. enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount }; - core::Blit* blits_[BlitCount]; + lazy_ptr blits_[BlitCount]; // @brief AQL queues for cache management and blit compute usage. 
enum QueueEnum { @@ -385,7 +407,7 @@ class GpuAgent : public GpuAgentInt { QueueCount }; - core::Queue* queues_[QueueCount]; + lazy_ptr queues_[QueueCount]; // @brief Mutex to protect the update to coherency type. KernelMutex coherency_lock_; @@ -443,6 +465,9 @@ class GpuAgent : public GpuAgentInt { // @brief Query the driver to get the cache properties. void InitCacheList(); + // @brief Create internal queues and blits. + void InitDma(); + // @brief Initialize memory pool for end timestamp object. // @retval True if the memory pool for end timestamp object is initialized. bool InitEndTsPool(); @@ -453,9 +478,6 @@ class GpuAgent : public GpuAgentInt { // @brief Alternative aperture size. Only on KV. size_t ape1_size_; - // @brief True if blit objects are initialized. - std::atomic blit_initialized_; - // Each end ts is 32 bytes. static const size_t kTsSize = 32; diff --git a/src/core/inc/amd_hsa_code.hpp b/src/core/inc/amd_hsa_code.hpp index a20207914..4d79660e9 100644 --- a/src/core/inc/amd_hsa_code.hpp +++ b/src/core/inc/amd_hsa_code.hpp @@ -267,6 +267,7 @@ namespace code { const amd::elf::Section* HsaText() const { assert(hsatext); return hsatext; } amd::elf::SymbolTable* Symtab() { assert(img); return img->symtab(); } uint16_t Machine() const { return img->Machine(); } + uint32_t EFlags() const { return img->EFlags(); } AmdHsaCode(bool combineDataSegments = true); virtual ~AmdHsaCode(); diff --git a/src/core/inc/blit.h b/src/core/inc/blit.h index 48aebaa64..640551a2f 100644 --- a/src/core/inc/blit.h +++ b/src/core/inc/blit.h @@ -75,24 +75,26 @@ class Blit { /// control block. The call is blocking until the command execution is /// finished. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. - virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) = 0; + virtual hsa_status_t SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) = 0; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is non blocking. The memory transfer will start /// after all dependent signals are satisfied. After the transfer is /// completed, the out signal will be decremented. /// + /// @param p2p true if it is a peer-to-peer copy /// @param dst Memory address of the copy destination. /// @param src Memory address of the copy source. /// @param size Size of the data to be copied. /// @param dep_signals Arrays of dependent signal. /// @param out_signal Output signal. virtual hsa_status_t SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) = 0; /// @brief Submit a linear fill command to the the underlying compute device's @@ -113,6 +115,9 @@ class Blit { /// @return HSA_STATUS_SUCCESS if the request to enable/disable profiling is /// successful. virtual hsa_status_t EnableProfiling(bool enable) = 0; + + /// @brief Blit operations use SDMA. + virtual bool isSDMA() const { return false; } }; } // namespace core diff --git a/src/core/inc/intercept_queue.h b/src/core/inc/intercept_queue.h index 82ec222fe..3f1cd956e 100644 --- a/src/core/inc/intercept_queue.h +++ b/src/core/inc/intercept_queue.h @@ -183,7 +183,7 @@ class QueueProxy : public QueueWrapper { // @brief Provides packet intercept and rewrite capability for a queue. 
// Host-side dispatches are processed during doorbell ring. // Device-side dispatches are processed as an asynchronous signal event. -class InterceptQueue : public QueueProxy, private LocalSignal, public Signal { +class InterceptQueue : public QueueProxy, private LocalSignal, public DoorbellSignal { public: explicit InterceptQueue(std::unique_ptr queue); ~InterceptQueue(); @@ -250,152 +250,6 @@ class InterceptQueue : public QueueProxy, private LocalSignal, public Signal { StoreRelaxed(value); } - /// @brief This operation is illegal - hsa_signal_value_t LoadRelaxed() { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t LoadAcquire() { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, - uint64_t timeout, hsa_wait_state_t wait_hint) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - void AndRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AndAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AndRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AndAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void OrAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void XorAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void AddAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubRelaxed(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubAcquire(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubRelease(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - void SubAcqRel(hsa_signal_value_t value) { assert(false); } - - /// @brief This operation is illegal - hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - 
hsa_signal_value_t ExchRelease(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasRelease(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, hsa_signal_value_t value) { - assert(false); - return 0; - } - - /// @brief This operation is illegal - hsa_signal_value_t* ValueLocation() const { - assert(false); - return NULL; - } - - /// @brief This operation is illegal - HsaEvent* EopEvent() { - assert(false); - return NULL; - } - static __forceinline bool IsType(core::Signal* signal) { return signal->IsType(&rtti_id_); } static __forceinline bool IsType(core::Queue* queue) { return queue->IsType(&rtti_id_); } diff --git a/src/core/inc/isa.h b/src/core/inc/isa.h index f2c224f2f..1dd6576cc 100644 --- a/src/core/inc/isa.h +++ b/src/core/inc/isa.h @@ -106,18 +106,30 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> { const Version &version() const { return version_; } + /// @returns True if this Isa has xnack enabled, false otherwise. + const bool &xnackEnabled() const { + return xnackEnabled_; + } /// @returns This Isa's supported wavefront. const Wavefront &wavefront() const { return wavefront_; } + /// @returns This Isa's architecture. + std::string GetArchitecture() const { + return "amdgcn"; + } /// @returns This Isa's vendor. std::string GetVendor() const { - return "AMD"; + return "amd"; } - /// @returns This Isa's architecture. - std::string GetArchitecture() const { - return "AMDGPU"; + /// @returns This Isa's OS. + std::string GetOS() const { + return "amdhsa"; + } + /// @returns This Isa's environment. + std::string GetEnvironment() const { + return ""; } /// @returns This Isa's major version. int32_t GetMajorVersion() const { @@ -140,7 +152,8 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> { /// otherwise. bool IsCompatible(const Isa *isa_object) const { assert(isa_object); - return version_ == isa_object->version_; + return version_ == isa_object->version_ && + xnackEnabled_ == isa_object->xnackEnabled_; } /// @returns True if this Isa is compatible with @p isa_handle, false /// otherwise. @@ -168,14 +181,20 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> { private: /// @brief Default constructor. - Isa(): version_(Version(-1, -1, -1)) {} + Isa(): version_(Version(-1, -1, -1)), xnackEnabled_(false) {} /// @brief Construct from @p version. - Isa(const Version &version): version_(version) {} + Isa(const Version &version): version_(version), xnackEnabled_(false) {} + + /// @brief Construct from @p version. + Isa(const Version &version, const bool xnack): version_(version), xnackEnabled_(xnack) {} /// @brief Isa's version. Version version_; + /// @brief Isa's supported xnack flag. + bool xnackEnabled_; + /// @brief Isa's supported wavefront. 
  Wavefront wavefront_;

@@ -190,7 +209,7 @@ class IsaRegistry final {
   /// @returns Isa for requested @p full_name, null pointer if not supported.
   static const Isa *GetIsa(const std::string &full_name);
   /// @returns Isa for requested @p version, null pointer if not supported.
-  static const Isa *GetIsa(const Isa::Version &version);
+  static const Isa *GetIsa(const Isa::Version &version, bool xnack);

  private:
   /// @brief IsaRegistry's map type.
diff --git a/src/core/inc/queue.h b/src/core/inc/queue.h
index d5fc23209..811206457 100644
--- a/src/core/inc/queue.h
+++ b/src/core/inc/queue.h
@@ -75,8 +75,7 @@ struct AqlPacket {

   std::string string() const {
     std::stringstream string;
-    uint8_t type = ((dispatch.header >> HSA_PACKET_HEADER_TYPE) &
-                    ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1));
+    uint8_t type = this->type();

     const char* type_names[] = {
         "HSA_PACKET_TYPE_VENDOR_SPECIFIC", "HSA_PACKET_TYPE_INVALID",
@@ -318,10 +317,11 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
   virtual void do_set_public_handle(hsa_queue_t* handle) {
     public_handle_ = handle;
   }
-  hsa_queue_t* public_handle_;

   virtual bool _IsA(rtti_t id) const = 0;

+  hsa_queue_t* public_handle_;
+
  private:
   DISALLOW_COPY_AND_ASSIGN(Queue);
 };
diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h
index ac0d4e8d3..d1aed4e26 100644
--- a/src/core/inc/runtime.h
+++ b/src/core/inc/runtime.h
@@ -510,7 +510,7 @@ class Runtime {
   void* vm_fault_handler_user_data_;

   // Holds reference count to runtime object.
-  volatile uint32_t ref_count_;
+  std::atomic<uint32_t> ref_count_;

   // Track environment variables.
   Flag flag_;
diff --git a/src/core/inc/signal.h b/src/core/inc/signal.h
index ddc31ebd7..401b2c66e 100644
--- a/src/core/inc/signal.h
+++ b/src/core/inc/signal.h
@@ -60,6 +60,18 @@

 #include "inc/amd_hsa_signal.h"

+// Allow hsa_signal_t to be keys in STL structures.
+namespace std {
+template <> struct less<hsa_signal_t> {
+  __forceinline bool operator()(const hsa_signal_t& x, const hsa_signal_t& y) const {
+    return x.handle < y.handle;
+  }
+  typedef hsa_signal_t first_argument_type;
+  typedef hsa_signal_t second_argument_type;
+  typedef bool result_type;
+};
+}
+
 namespace core {
 class Agent;
 class Signal;
@@ -333,6 +345,166 @@ class Signal {
   DISALLOW_COPY_AND_ASSIGN(Signal);
 };

+/// @brief Handle signal operations which are not for use on doorbells.
+class DoorbellSignal : public Signal { + public: + using Signal::Signal; + + /// @brief This operation is illegal + hsa_signal_value_t LoadRelaxed() final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t LoadAcquire() final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + void AndRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AndAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AndRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AndAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void OrAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void XorAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void AddAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubRelaxed(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubAcquire(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubRelease(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + void SubAcqRel(hsa_signal_value_t value) final override { assert(false); } + + /// @brief This operation is illegal + hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t 
ExchRelease(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasRelease(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, + hsa_signal_value_t value) final override { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t* ValueLocation() const final override { + assert(false); + return NULL; + } + + /// @brief This operation is illegal + HsaEvent* EopEvent() final override { + assert(false); + return NULL; + } + + protected: + /// @brief Disallow destroying doorbell apart from its queue. + void doDestroySignal() final override { assert(false); } +}; + struct hsa_signal_handle { hsa_signal_t signal; diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index ab8fee67a..cac25fcf4 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -72,7 +72,7 @@ namespace amd { // Queue::amd_queue_ is cache-aligned for performance. const uint32_t kAmdQueueAlignBytes = 0x40; -HsaEvent* AqlQueue::queue_event_ = NULL; +HsaEvent* AqlQueue::queue_event_ = nullptr; std::atomic AqlQueue::queue_count_(0); KernelMutex AqlQueue::queue_lock_; int AqlQueue::rtti_id_ = 0; @@ -81,7 +81,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr core::HsaEventCallback callback, void* err_data, bool is_kv) : Queue(), LocalSignal(0), - Signal(signal()), + DoorbellSignal(signal()), ring_buf_(nullptr), ring_buf_alloc_bytes_(0), queue_id_(HSA_QUEUEID(-1)), @@ -92,7 +92,8 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr errors_data_(err_data), is_kv_queue_(is_kv), pm4_ib_buf_(nullptr), - pm4_ib_size_b_(0x1000) { + pm4_ib_size_b_(0x1000), + dynamicScratchState(0) { // When queue_full_workaround_ is set to 1, the ring buffer is internally // doubled in size. Virtual addresses in the upper half of the ring allocation // are mapped to the same set of pages backing the lower half. @@ -279,17 +280,26 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr } AqlQueue::~AqlQueue() { - Inactivate(); + // Remove error handler synchronously. + // Sequences error handler callbacks with queue destroy. 
+ dynamicScratchState |= ERROR_HANDLER_TERMINATE; + HSA::hsa_signal_store_screlease(amd_queue_.queue_inactive_signal, 0x8000000000000000ull); + while ((dynamicScratchState & ERROR_HANDLER_DONE) != ERROR_HANDLER_DONE) { + HSA::hsa_signal_wait_relaxed(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, + 0x8000000000000000ull, -1ull, HSA_WAIT_STATE_BLOCKED); + HSA::hsa_signal_store_relaxed(amd_queue_.queue_inactive_signal, 0x8000000000000000ull); + } + Inactivate(); + agent_->ReleaseQueueScratch(queue_scratch_); FreeRegisteredRingBuffer(); - agent_->ReleaseQueueScratch(queue_scratch_.queue_base); HSA::hsa_signal_destroy(amd_queue_.queue_inactive_signal); if (core::g_use_interrupt_wait) { ScopedAcquire lock(&queue_lock_); queue_count_--; if (queue_count_ == 0) { core::InterruptSignal::DestroyEvent(queue_event_); - queue_event_ = NULL; + queue_event_ = nullptr; } } core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_); @@ -471,7 +481,7 @@ uint32_t AqlQueue::ComputeRingBufferMaxPkts() { } void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { - if (agent_->profile() == HSA_PROFILE_FULL) { + if ((agent_->profile() == HSA_PROFILE_FULL) && queue_full_workaround_) { // Compute the physical and virtual size of the queue. uint32_t ring_buf_phys_size_bytes = uint32_t(queue_size_pkts * sizeof(core::AqlPacket)); @@ -602,23 +612,22 @@ void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { } else { // Allocate storage for the ring buffer. ring_buf_alloc_bytes_ = AlignUp( - queue_size_pkts * static_cast(sizeof(core::AqlPacket)), 4096); + queue_size_pkts * sizeof(core::AqlPacket), 4096); ring_buf_ = core::Runtime::runtime_singleton_->system_allocator()( - ring_buf_alloc_bytes_, 0x1000, - core::MemoryRegion::AllocateExecutable | - core::MemoryRegion::AllocateDoubleMap); + ring_buf_alloc_bytes_, 0x1000, core::MemoryRegion::AllocateExecutable | + (queue_full_workaround_ ? core::MemoryRegion::AllocateDoubleMap : 0)); assert(ring_buf_ != NULL && "AQL queue memory allocation failure"); // The virtual ring allocation is twice as large as requested. // Each half maps to the same set of physical pages. 
- ring_buf_alloc_bytes_ *= 2; + if (queue_full_workaround_) ring_buf_alloc_bytes_ *= 2; } } void AqlQueue::FreeRegisteredRingBuffer() { - if (agent_->profile() == HSA_PROFILE_FULL) { + if ((agent_->profile() == HSA_PROFILE_FULL) && queue_full_workaround_) { #ifdef __linux__ munmap(ring_buf_, ring_buf_alloc_bytes_); #endif @@ -676,110 +685,166 @@ int AqlQueue::CreateRingBufferFD(const char* ring_buf_shm_path, #endif } +void AqlQueue::Suspend() { + auto err = hsaKmtUpdateQueue(queue_id_, 0, HSA_QUEUE_PRIORITY_NORMAL, NULL, 0, NULL); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtUpdateQueue failed."); +} + hsa_status_t AqlQueue::Inactivate() { bool active = active_.exchange(false, std::memory_order_relaxed); if (active) { - auto err = hsaKmtDestroyQueue(this->queue_id_); + auto err = hsaKmtDestroyQueue(queue_id_); assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtDestroyQueue failed."); + atomic::Fence(std::memory_order_acquire); } return HSA_STATUS_SUCCESS; } bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { AqlQueue* queue = (AqlQueue*)arg; + hsa_status_t errorCode = HSA_STATUS_SUCCESS; + bool fatal = false; + bool changeWait = false; + hsa_signal_value_t waitVal; + + if ((queue->dynamicScratchState & ERROR_HANDLER_SCRATCH_RETRY) == ERROR_HANDLER_SCRATCH_RETRY) { + queue->dynamicScratchState &= ~ERROR_HANDLER_SCRATCH_RETRY; + queue->agent_->RemoveScratchNotifier(queue->amd_queue_.queue_inactive_signal); + changeWait = true; + waitVal = 0; + HSA::hsa_signal_and_relaxed(queue->amd_queue_.queue_inactive_signal, ~0x8000000000000000ull); + error_code &= ~0x8000000000000000ull; + } - if ((error_code & 1) == 1) { - // Insufficient scratch - recoverable - auto& scratch = queue->queue_scratch_; + // Process errors only if queue is not terminating. + if ((queue->dynamicScratchState & ERROR_HANDLER_TERMINATE) != ERROR_HANDLER_TERMINATE) { + if (error_code == 512) { // Large scratch reclaim + auto& scratch = queue->queue_scratch_; + queue->agent_->ReleaseQueueScratch(scratch); + scratch.queue_base = nullptr; + scratch.size = 0; + scratch.size_per_thread = 0; + scratch.queue_process_offset = 0; + queue->InitScratchSRD(); + + HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0); + // Resumes queue processing. + atomic::Store(&queue->amd_queue_.queue_properties, + queue->amd_queue_.queue_properties & (~AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE), + std::memory_order_release); + atomic::Fence(std::memory_order_release); + return true; + } + + // Process only one queue error. + if (error_code == 1) { + // Insufficient scratch - recoverable, don't process dynamic scratch if errors are present. + auto& scratch = queue->queue_scratch_; + + queue->agent_->ReleaseQueueScratch(scratch); + + uint64_t pkt_slot_idx = + queue->amd_queue_.read_dispatch_id & (queue->amd_queue_.hsa_queue.size - 1); - queue->agent_->ReleaseQueueScratch(scratch.queue_base); + core::AqlPacket& pkt = + ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; - uint64_t pkt_slot_idx = queue->amd_queue_.read_dispatch_id % queue->amd_queue_.hsa_queue.size; + uint32_t scratch_request = pkt.dispatch.private_segment_size; - const core::AqlPacket& pkt = - ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx]; + scratch.size_per_thread = scratch_request; + // Align whole waves to 1KB. 
+ scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); + scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * + queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize; - uint32_t scratch_request = pkt.dispatch.private_segment_size; + queue->agent_->AcquireQueueScratch(scratch); - scratch.size_per_thread = - Max(uint32_t(scratch.size_per_thread * 2), scratch_request); - // Align whole waves to 1KB. - scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); - scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * - queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize; + if (scratch.retry) { + queue->agent_->AddScratchNotifier(queue->amd_queue_.queue_inactive_signal, + 0x8000000000000000ull); + queue->dynamicScratchState |= ERROR_HANDLER_SCRATCH_RETRY; + changeWait = true; + waitVal = error_code; + } else { + // Out of scratch - promote error + if (scratch.queue_base == nullptr) { + errorCode = HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } else { + // Mark large scratch allocation for single use. + if (scratch.large) { + queue->amd_queue_.queue_properties |= AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE; + // Set system release fence to flush scratch stores with older firmware versions. + if ((queue->agent_->isa()->GetMajorVersion() == 8) && + (queue->agent_->GetMicrocodeVersion() < 729)) { + pkt.dispatch.header &= ~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1) + << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + pkt.dispatch.header |= + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); + } + } + // Reset scratch memory related entities for the queue + queue->InitScratchSRD(); + // Restart the queue. + HSA::hsa_signal_store_screlease(queue->amd_queue_.queue_inactive_signal, 0); + } + } - queue->agent_->AcquireQueueScratch(scratch); - if (scratch.queue_base == NULL) { - // Out of scratch - promote error and invalidate queue - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_OUT_OF_RESOURCES, - queue->public_handle(), queue->errors_data_); - return false; + } else if ((error_code & 2) == 2) { // Invalid dim + errorCode = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + + } else if ((error_code & 4) == 4) { // Invalid group memory + errorCode = HSA_STATUS_ERROR_INVALID_ALLOCATION; + + } else if ((error_code & 8) == 8) { // Invalid (or NULL) code + errorCode = HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + + } else if (((error_code & 32) == 32) || // Invalid format: 32 is generic, + ((error_code & 256) == 256)) { // 256 is vendor specific packets + errorCode = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT; + + } else if ((error_code & 64) == 64) { // Group is too large + errorCode = HSA_STATUS_ERROR_INVALID_ARGUMENT; + + } else if ((error_code & 128) == 128) { // Out of VGPRs + errorCode = HSA_STATUS_ERROR_INVALID_ISA; + + } else if ((error_code & 0x80000000) == 0x80000000) { // Debug trap + errorCode = HSA_STATUS_ERROR_EXCEPTION; + fatal = true; + + } else { // Undefined code + assert(false && "Undefined queue error code"); + errorCode = HSA_STATUS_ERROR; + fatal = true; } - // Reset scratch memory related entities for the queue - queue->InitScratchSRD(); - - } else if ((error_code & 2) == 2) { // Invalid dim - queue->Inactivate(); - if (queue->errors_callback_ != NULL) - queue->errors_callback_(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, - queue->public_handle(), queue->errors_data_); - return false; - - } else if 
((error_code & 4) == 4) { // Invalid group memory
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ALLOCATION,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-
-  } else if ((error_code & 8) == 8) { // Invalid (or NULL) code
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_CODE_OBJECT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-
-  } else if (((error_code & 32) == 32) ||
-             ((error_code & 256) == 256)) { // Invalid format: 32 is generic,
-                                            // 256 is vendor specific packets
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_PACKET_FORMAT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 64) == 64) { // Group is too large
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ARGUMENT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 128) == 128) { // Out of VGPRs
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ISA,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 0x80000000) == 0x80000000) { // Debug trap
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_EXCEPTION,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else {
-    // Undefined code
-    queue->Inactivate();
-    assert(false && "Undefined queue error code");
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR, queue->public_handle(),
-                              queue->errors_data_);
-    return false;
-  }
+    if (errorCode == HSA_STATUS_SUCCESS) {
+      if (changeWait) {
+        core::Runtime::runtime_singleton_->SetAsyncSignalHandler(
+            queue->amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, waitVal,
+            DynamicScratchHandler, queue);
+        return false;
+      }
+      return true;
+    }
-  HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0);
-  return true;
+    queue->Suspend();
+    if (queue->errors_callback_ != nullptr) {
+      queue->errors_callback_(errorCode, queue->public_handle(), queue->errors_data_);
+    }
+    if (fatal) {
+      // Temporarily removed until there is clarity on exactly what debugtrap's semantics are.
+      // assert(false && "Fatal queue error");
+      // std::abort();
+    }
+  }
+  // Copy here is to protect against queue being released between setting the scratch state and
+  // updating the signal value. The signal itself is safe to use because it is ref counted rather
+  // than being released with the queue.
+  hsa_signal_t signal = queue->amd_queue_.queue_inactive_signal;
+  queue->dynamicScratchState = ERROR_HANDLER_DONE;
+  HSA::hsa_signal_store_screlease(signal, -1ull);
+  return false;
 }

 hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count,
@@ -800,7 +865,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) {
   // Obtain a queue slot for a single AQL packet.
uint64_t write_idx = queue->AddWriteIndexAcqRel(1); - while ((write_idx - queue->LoadReadIndexRelaxed()) > queue->amd_queue_.hsa_queue.size) { + while ((write_idx - queue->LoadReadIndexRelaxed()) >= queue->amd_queue_.hsa_queue.size) { os::YieldThread(); } diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp index b9941d69e..a4b77fa09 100644 --- a/src/core/runtime/amd_blit_kernel.cpp +++ b/src/core/runtime/amd_blit_kernel.cpp @@ -587,8 +587,8 @@ hsa_status_t BlitKernel::Destroy(const core::Agent& agent) { return HSA_STATUS_SUCCESS; } -hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) { +hsa_status_t BlitKernel::SubmitLinearCopyCommand(bool p2p, void* dst, + const void* src, size_t size) { // Protect completion_signal_. std::lock_guard guard(lock_); @@ -597,7 +597,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, std::vector dep_signals(0); hsa_status_t stat = SubmitLinearCopyCommand( - dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_)); + p2p, dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_)); if (stat != HSA_STATUS_SUCCESS) { return stat; @@ -614,7 +614,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, } hsa_status_t BlitKernel::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) { // Reserve write index for barrier(s) + dispatch packet. const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5); @@ -624,6 +624,7 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand( uint64_t write_index_temp = write_index; // Insert barrier packets to handle dependent signals. + // Barrier bit keeps signal checking traffic from competing with a copy. const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | @@ -775,8 +776,7 @@ hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value, } hsa_status_t BlitKernel::EnableProfiling(bool enable) { - AMD_HSA_BITS_SET(queue_->amd_queue_.queue_properties, - AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, enable); + queue_->SetProfiling(enable); return HSA_STATUS_SUCCESS; } @@ -799,51 +799,6 @@ void BlitKernel::ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet) { doorbell->StoreRelease(write_index + num_packet - 1); } -hsa_status_t BlitKernel::FenceRelease(uint64_t write_index, - uint32_t num_copy_packet, - hsa_fence_scope_t fence) { - // This function is not thread safe. - - const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (fence << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - - hsa_barrier_and_packet_t packet = {0}; - packet.header = kInvalidPacketHeader; - - HSA::hsa_signal_store_relaxed(completion_signal_, 1); - packet.completion_signal = completion_signal_; - - if (num_copy_packet == 0) { - assert(write_index == 0); - // Reserve write index. - write_index = AcquireWriteIndex(1); - } - - // Populate queue buffer with AQL packet. 
- hsa_barrier_and_packet_t* queue_buffer = - reinterpret_cast( - queue_->public_handle()->base_address); - std::atomic_thread_fence(std::memory_order_acquire); - queue_buffer[(write_index + num_copy_packet) & queue_bitmask_] = packet; - std::atomic_thread_fence(std::memory_order_release); - queue_buffer[(write_index + num_copy_packet) & queue_bitmask_].header = - kBarrierPacketHeader; - - // Launch packet. - ReleaseWriteIndex(write_index, num_copy_packet + 1); - - // Wait for the packet to finish. - if (HSA::hsa_signal_wait_scacquire(packet.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE) != 0) { - // Signal wait returned unexpected value. - return HSA_STATUS_ERROR; - } - - return HSA_STATUS_SUCCESS; -} - void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args, uint32_t grid_size_x, hsa_signal_t completion_signal) { diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp index 7daf92ba8..47218ec67 100644 --- a/src/core/runtime/amd_blit_sdma.cpp +++ b/src/core/runtime/amd_blit_sdma.cpp @@ -360,6 +360,20 @@ typedef struct SDMA_PKT_TRAP_TAG { } INT_CONTEXT_UNION; } SDMA_PKT_TRAP; +// Initialize Hdp flush packet for use on sDMA of devices +// from Gfx9 or new family +static const SDMA_PKT_POLL_REGMEM hdp_flush_cmd_ { + { SDMA_OP_POLL_REGMEM }, + { 0x00 }, + { 0x80000000 }, + { 0x00 }, + { 0x00 }, + { 0x00 }, +}; + +// Version of sDMA microcode supporting Hdp flush +static const uint16_t sdma_version_ = 0x01A5; + inline uint32_t ptrlow32(const void* p) { return static_cast(reinterpret_cast(p)); } @@ -377,8 +391,33 @@ const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR); const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0; +// Initialize size of various sDMA commands use by this module +template +const uint32_t BlitSdma::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR); + +template +const uint32_t BlitSdma::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); + +template +const uint32_t BlitSdma::fence_command_size_ = sizeof(SDMA_PKT_FENCE); + +template +const uint32_t BlitSdma::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); + +template +const uint32_t BlitSdma::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); + +template +const uint32_t BlitSdma::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); + template -BlitSdma::BlitSdma() +const uint32_t BlitSdma::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); + +template +const uint32_t BlitSdma::trap_command_size_ = sizeof(SDMA_PKT_TRAP); + +template +BlitSdma::BlitSdma(bool copy_direction) : agent_(NULL), queue_start_addr_(NULL), fence_base_addr_(NULL), @@ -386,7 +425,9 @@ BlitSdma::BlitSdma() fence_pool_counter_(0), cached_reserve_index_(0), cached_commit_index_(0), - platform_atomic_support_(true) { + sdma_h2d_(copy_direction), + platform_atomic_support_(true), + hdp_flush_support_(false) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } @@ -407,14 +448,6 @@ hsa_status_t BlitSdma::Initial return HSA_STATUS_ERROR; } - linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR); - fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); - fence_command_size_ = sizeof(SDMA_PKT_FENCE); - poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); - atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); - timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); - trap_command_size_ = sizeof(SDMA_PKT_TRAP); - const amd::GpuAgentInt& amd_gpu_agent = 
static_cast(agent); @@ -423,10 +456,16 @@ hsa_status_t BlitSdma::Initial return HSA_STATUS_ERROR; } - if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) { + if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1) || + amd_gpu_agent.isa()->GetMajorVersion() == 9) { platform_atomic_support_ = false; } + // Determine if sDMA microcode supports HDP flush command + if (agent_->GetSdmaMicrocodeVersion() >= sdma_version_) { + hdp_flush_support_ = true; + } + // Allocate queue buffer. queue_start_addr_ = (char*)core::Runtime::runtime_singleton_->system_allocator()( kQueueSize, 0x1000, core::MemoryRegion::AllocateExecutable); @@ -499,7 +538,7 @@ hsa_status_t BlitSdma::Destroy template hsa_status_t BlitSdma::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size) { + bool p2p, void* dst, const void* src, size_t size) { // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; @@ -507,8 +546,16 @@ hsa_status_t BlitSdma::SubmitL const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; + // Add space for acquire or release Hdp flush command + uint32_t flush_cmd_size = 0; + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (p2p)) { + flush_cmd_size = flush_command_size_; + } + } + const uint32_t total_command_size = - total_copy_command_size + fence_command_size_; + total_copy_command_size + fence_command_size_ + flush_cmd_size; const uint32_t kFenceValue = 2015; uint32_t* fence_addr = ObtainFenceObject(); @@ -521,10 +568,25 @@ hsa_status_t BlitSdma::SubmitL return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - BuildCopyCommand(command_addr, num_copy_command, dst, src, size); + // Determine if a Hdp flush cmd is required at the top of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + BuildCopyCommand(command_addr, num_copy_command, dst, src, size); command_addr += total_copy_command_size; + // Determine if a Hdp flush cmd is required at the end of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + BuildFenceCommand(command_addr, fence_addr, kFenceValue); ReleaseWriteAddress(curr_index, total_command_size); @@ -536,7 +598,7 @@ hsa_status_t BlitSdma::SubmitL template hsa_status_t BlitSdma::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, std::vector& dep_signals, + bool p2p, void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) { // The signal is 64 bit value, and poll checks for 32 bit value. So we // need to use two poll operations per dependent signal. @@ -592,9 +654,17 @@ hsa_status_t BlitSdma::SubmitL ? 
(fence_command_size_ + trap_command_size_) : 0; + // Add space for acquire or release Hdp flush command + uint32_t flush_cmd_size = 0; + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (p2p)) { + flush_cmd_size = flush_command_size_; + } + } + const uint32_t total_command_size = total_poll_command_size + total_copy_command_size + sync_command_size + - total_timestamp_command_size + interrupt_command_size; + total_timestamp_command_size + interrupt_command_size + flush_cmd_size; RingIndexTy curr_index; char* command_addr = AcquireWriteAddress(total_command_size, curr_index); @@ -620,11 +690,26 @@ hsa_status_t BlitSdma::SubmitL command_addr += timestamp_command_size_; } + // Determine if a Hdp flush cmd is required at the top of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + // Do the transfer after all polls are satisfied. BuildCopyCommand(command_addr, num_copy_command, dst, src, size); - command_addr += total_copy_command_size; + // Determine if a Hdp flush cmd is required at the end of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_) && (p2p)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + if (profiling_enabled) { assert(IsMultipleOf(end_ts_addr, 32)); BuildGetGlobalTimestampCommand(command_addr, @@ -684,8 +769,24 @@ hsa_status_t BlitSdma::SubmitL const uint32_t total_fill_command_size = num_fill_command * fill_command_size_; + // Add space for acquire or release Hdp flush command + uint32_t flush_cmd_size = 0; + + /* + * @note: Commenting this block of code. This is safe since this method + * is never entered. Runtime binds client requests to BlitKernels i.e. 
+ * the Blit object being chosen is blit[dev-to-dev] + */ + /* + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_)) { + flush_cmd_size = flush_command_size_; + } + } + */ + const uint32_t total_command_size = - total_fill_command_size + fence_command_size_; + total_fill_command_size + fence_command_size_ + flush_cmd_size; RingIndexTy curr_index; char* command_addr = AcquireWriteAddress(total_command_size, curr_index); @@ -723,6 +824,17 @@ hsa_status_t BlitSdma::SubmitL assert(cur_size == size); + // Determine if a Hdp flush cmd is required at the end of cmd stream + // @note: Blocked per comments above lines - 776-778 + /* + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + */ + const uint32_t kFenceValue = 2015; uint32_t* fence_addr = ObtainFenceObject(); *fence_addr = 0; @@ -1017,6 +1129,14 @@ void BlitSdma::BuildTrapComman packet_addr->HEADER_UNION.op = SDMA_OP_TRAP; } +template +void BlitSdma::BuildHdpFlushCommand( + char* cmd_addr) { + assert(cmd_addr != NULL); + SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast(cmd_addr); + memcpy(addr, &hdp_flush_cmd_, flush_command_size_); +} + template class BlitSdma; template class BlitSdma; diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 78bede5d2..37a000be4 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -49,6 +49,8 @@ #include #include #include +#include +#include #include "core/inc/amd_aql_queue.h" #include "core/inc/amd_blit_kernel.h" @@ -83,7 +85,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) memory_max_frequency_(0), ape1_base_(0), ape1_size_(0), - blit_initialized_(false), end_ts_pool_size_(0), end_ts_pool_counter_(0), end_ts_base_addr_(NULL) { @@ -97,7 +98,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) // Set instruction set architecture via node property, only on GPU device. isa_ = (core::Isa*)core::IsaRegistry::GetIsa(core::Isa::Version( node_props.EngineId.ui32.Major, node_props.EngineId.ui32.Minor, - node_props.EngineId.ui32.Stepping)); + node_props.EngineId.ui32.Stepping), profile_ == HSA_PROFILE_FULL); // Check if the device is Kaveri, only on GPU device. 
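The HDP-flush gating repeated in the copy and fill paths above can be read as one small decision. A hypothetical helper (illustration only, not part of this patch) that captures it:

enum class HdpFlushPos { kNone, kBeforeCopy, kAfterCopy };

// Mirrors the conditions used above: the flush is emitted only when the
// HSA_ENABLE_SDMA_HDP_FLUSH flag is on, the engine uses monotonic ring
// indices, the SDMA microcode is new enough, and the copy is peer-to-peer.
// Placement then follows the copy direction handled by this blit object.
static HdpFlushPos HdpFlushPlacement(bool flag_enabled, bool hw_index_monotonic,
                                     bool hdp_flush_support, bool p2p, bool h2d) {
  if (!(flag_enabled && hw_index_monotonic && hdp_flush_support && p2p))
    return HdpFlushPos::kNone;
  return h2d ? HdpFlushPos::kAfterCopy : HdpFlushPos::kBeforeCopy;
}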
if (isa_->GetMajorVersion() == 7 && isa_->GetMinorVersion() == 0 && @@ -131,17 +132,12 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) GpuAgent::~GpuAgent() { for (int i = 0; i < BlitCount; ++i) { - if (blits_[i] != NULL) { + if (blits_[i] != nullptr) { hsa_status_t status = blits_[i]->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); - delete blits_[i]; } } - for (int i = 0; i < QueueCount; ++i) { - delete queues_[i]; - } - if (end_ts_base_addr_ != NULL) { core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_); } @@ -336,7 +332,6 @@ void GpuAgent::InitScratchPool() { // scratch/thread const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU; - queue_scratch_len_ = 0; queue_scratch_len_ = AlignUp(32 * 64 * num_cu * scratch_per_thread_, 65536); size_t max_scratch_len = queue_scratch_len_ * max_queues_; @@ -358,7 +353,7 @@ void GpuAgent::InitScratchPool() { if (HSAKMT_STATUS_SUCCESS == err) { new (&scratch_pool_) SmallHeap(scratch_base, max_scratch_len); } else { - new (&scratch_pool_) SmallHeap(NULL, 0); + new (&scratch_pool_) SmallHeap(); } } @@ -521,13 +516,13 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() { return queue; } -core::Blit* GpuAgent::CreateBlitSdma() { +core::Blit* GpuAgent::CreateBlitSdma(bool h2d) { core::Blit* sdma; if (isa_->GetMajorVersion() <= 8) { - sdma = new BlitSdmaV2V3; + sdma = new BlitSdmaV2V3(h2d); } else { - sdma = new BlitSdmaV4; + sdma = new BlitSdmaV4(h2d); } if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) { @@ -552,74 +547,97 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { } void GpuAgent::InitDma() { - // This provides the ability to lazy init the blit objects on places that - // could give indication of DMA usage in the future. E.g.: - // 1. Call to allow access API. - // 2. Call to memory lock API. - if (!blit_initialized_.load(std::memory_order_acquire)) { - ScopedAcquire lock(&blit_lock_); - if (!blit_initialized_.load(std::memory_order_relaxed)) { - // Try create SDMA blit first. - // TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified. - if ((isa_->GetMajorVersion() != 8) && - core::Runtime::runtime_singleton_->flag().enable_sdma() && - (HSA_PROFILE_BASE == profile_)) { - blits_[BlitHostToDev] = CreateBlitSdma(); - blits_[BlitDevToHost] = CreateBlitSdma(); - - if (blits_[BlitHostToDev] != NULL && blits_[BlitDevToHost] != NULL) { - blit_initialized_.store(true, std::memory_order_release); - return; - } - } - - // Fall back to blit kernel if SDMA is unavailable. - if (blits_[BlitHostToDev] == NULL) { - // Create a dedicated compute queue for host-to-device blits. - queues_[QueueBlitOnly] = CreateInterceptibleQueue(); - assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed"); + // Setup lazy init pointers on queues and blits. + auto queue_lambda = [this]() { + auto ret = CreateInterceptibleQueue(); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Internal queue creation failed."); + return ret; + }; + // Dedicated compute queue for host-to-device blits. + queues_[QueueBlitOnly].reset(queue_lambda); + // Share utility queue with device-to-host blits. + queues_[QueueUtility].reset(queue_lambda); + + // Decide which engine to use for blits. + auto blit_lambda = [this](bool h2d, lazy_ptr& queue) { + std::string sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma(); + bool use_sdma = (sdma_override.size() == 0) ? 
(isa_->GetMajorVersion() != 8) : (sdma_override == "1"); + + if (use_sdma && (HSA_PROFILE_BASE == profile_)) { + auto ret = CreateBlitSdma(h2d); + if (ret != nullptr) return ret; + } - blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]); - assert(blits_[BlitHostToDev] != NULL && "Blit creation failed"); - } + auto ret = CreateBlitKernel((*queue).get()); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); + return ret; + }; - if (blits_[BlitDevToHost] == NULL) { - // Share utility queue with device-to-host blits. - if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue(); - blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]); - assert(blits_[BlitDevToHost] != NULL && "Blit creation failed"); - } + blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueBlitOnly]); }); + blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); }); + blits_[BlitDevToDev].reset([this]() { + auto ret = CreateBlitKernel((*queues_[QueueUtility]).get()); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); + return ret; + }); +} - blit_initialized_.store(true, std::memory_order_release); - } - } +void GpuAgent::PreloadBlits() { + blits_[BlitHostToDev].touch(); + blits_[BlitDevToHost].touch(); + blits_[BlitDevToDev].touch(); } hsa_status_t GpuAgent::PostToolsInit() { // Defer memory allocation until agents have been discovered. InitScratchPool(); BindTrapHandler(); + InitDma(); - // Defer utility queue creation to allow tools to intercept. - if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue(); - - if (queues_[QueueUtility] == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - // Share utility queue with device-to-device blits. - if (blits_[BlitDevToDev] == nullptr) - blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]); + return HSA_STATUS_SUCCESS; +} - if (blits_[BlitDevToDev] == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; +struct DmaDeps_t { + bool p2p; + void* dst; + const void* src; + size_t size; + core::Signal* out_signal; + core::Blit* blit; + std::unique_ptr> deps; +}; + +static bool DmaDeps(hsa_signal_value_t val, void* arg) { + DmaDeps_t* Args = (DmaDeps_t*)arg; + std::vector& deps = *(Args->deps.get()); + if (val != 0) return true; + for (int i = deps.size() - 1; i != 0; i--) { + if (deps[i - 1]->LoadRelaxed() != 0) { + deps.resize(i); + hsa_status_t err = core::Runtime::runtime_singleton_->SetAsyncSignalHandler( + core::Signal::Convert(deps.back()), HSA_SIGNAL_CONDITION_EQ, 0, DmaDeps, arg); + assert(err == HSA_STATUS_SUCCESS && "Failed to update dependency handler."); + return false; + } } - - return HSA_STATUS_SUCCESS; + deps.clear(); + hsa_status_t stat; + do { // Only ready to run copies are on the SDMA queue so if resources are busy they will soon be + // free. 
+ stat = Args->blit->SubmitLinearCopyCommand(Args->p2p, Args->dst, Args->src, Args->size, deps, + *(Args->out_signal)); + } while (stat != HSA_STATUS_SUCCESS); + delete Args; + return false; } hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) { - return blits_[BlitDevToDev]->SubmitLinearCopyCommand(dst, src, size); + // This operation is not a P2P operation - uses BlitKernel + return blits_[BlitDevToDev]->SubmitLinearCopyCommand(false, dst, src, size); } hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, @@ -627,26 +645,42 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, size_t size, std::vector& dep_signals, core::Signal& out_signal) { - core::Blit* blit = - (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice) - ? blits_[BlitHostToDev] - : (src_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.device_type() == core::Agent::kAmdCpuDevice) - ? blits_[BlitDevToHost] - : blits_[BlitDevToDev]; - - if (blit == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } + lazy_ptr& blit = + (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice) + ? blits_[BlitHostToDev] + : (src_agent.device_type() == core::Agent::kAmdGpuDevice && + dst_agent.device_type() == core::Agent::kAmdCpuDevice) + ? blits_[BlitDevToHost] + : (src_agent.node_id() == dst_agent.node_id()) + ? blits_[BlitDevToDev] : blits_[BlitDevToHost]; if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system // domain correctly. - out_signal.async_copy_agent(this); + out_signal.async_copy_agent(core::Agent::Convert(this->public_handle())); } - hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + // Determine if this is a Peer-To-Peer copy operation + bool p2p = ((src_agent.node_id() != dst_agent.node_id()) && + (src_agent.device_type() == core::Agent::kAmdGpuDevice) && + (dst_agent.device_type() == core::Agent::kAmdGpuDevice)); + + if ((dep_signals.size() != 0) && blit->isSDMA()) { + DmaDeps_t* Arg = new DmaDeps_t; + Arg->p2p = p2p; + Arg->dst = dst; + Arg->src = src; + Arg->size = size; + Arg->out_signal = &out_signal; + Arg->blit = (*blit).get(); + Arg->deps.reset(new std::vector(std::move(dep_signals))); + hsa_status_t stat = core::Runtime::runtime_singleton_->SetAsyncSignalHandler( + core::Signal::Convert(Arg->deps->back()), HSA_SIGNAL_CONDITION_EQ, 0, DmaDeps, Arg); + return stat; + } + + hsa_status_t stat = blit->SubmitLinearCopyCommand(p2p, dst, src, size, dep_signals, out_signal); return stat; } @@ -915,8 +949,9 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU; scratch.size = scratch.size_per_thread * 32 * 64 * num_cu; scratch.queue_base = nullptr; + scratch.queue_process_offset = 0; - MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueScratch(scratch.queue_base); }); + MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueScratch(scratch); }); if (scratch.size != 0) { AcquireQueueScratch(scratch); @@ -925,6 +960,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, } } + // Ensure utility queue has been created. + // Deferring longer risks exhausting queue count before ISA upload and invalidation capability is + // ensured. 
+ queues_[QueueUtility].touch(); + // Create an HW AQL queue *queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_); scratchGuard.Dismiss(); @@ -939,30 +979,50 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { scratch.size_per_thread = scratch_per_thread_; } + scratch.retry = false; + ScopedAcquire lock(&scratch_lock_); - scratch.queue_base = scratch_pool_.alloc(scratch.size); + // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue. + size_t small_limit = scratch_pool_.size() >> 3; + size_t single_limit = small_limit >> 2; + bool large = (scratch.size > single_limit) || + (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); + large = (isa_->GetMajorVersion() < 8) ? false : large; + if (large) + scratch.queue_base = scratch_pool_.alloc_high(scratch.size); + else + scratch.queue_base = scratch_pool_.alloc(scratch.size); + large |= scratch.queue_base > scratch_pool_.high_split(); + scratch.large = large; + scratch.queue_process_offset = (need_queue_scratch_base) ? uintptr_t(scratch.queue_base) : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); - if (scratch.queue_base != NULL) { + if (scratch.queue_base != nullptr) { if (profile_ == HSA_PROFILE_FULL) return; if (profile_ == HSA_PROFILE_BASE) { HSAuint64 alternate_va; - if (HSAKMT_STATUS_SUCCESS == - hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va)) + if (hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va) == + HSAKMT_STATUS_SUCCESS) { + if (large) scratch_used_large_ += scratch.size; return; + } } } // Scratch request failed allocation or mapping. scratch_pool_.free(scratch.queue_base); - scratch.queue_base = NULL; + scratch.queue_base = nullptr; + + // Retry if large may yield needed space. + if (scratch_used_large_ != 0) { + scratch.retry = true; + return; + } -// Attempt to trim the maximum number of concurrent waves to allow scratch to fit. -// This is somewhat dangerous as it limits the number of concurrent waves from future dispatches -// on the queue if those waves use even small amounts of scratch. + // Attempt to trim the maximum number of concurrent waves to allow scratch to fit. if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) debug_print("Failed to map requested scratch - reducing queue occupancy.\n"); uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; @@ -973,7 +1033,7 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { size_t size = waves_per_cu * num_cus * size_per_wave; void* base = scratch_pool_.alloc(size); HSAuint64 alternate_va; - if ((base != NULL) && + if ((base != nullptr) && ((profile_ == HSA_PROFILE_FULL) || (hsaKmtMapMemoryToGPU(base, size, &alternate_va) == HSAKMT_STATUS_SUCCESS))) { // Scratch allocated and either full profile or map succeeded. @@ -983,6 +1043,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { (need_queue_scratch_base) ? 
uintptr_t(scratch.queue_base) : uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); + scratch.large = true; + scratch_used_large_ += scratch.size; return; } scratch_pool_.free(base); @@ -990,23 +1052,29 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { } // Failed to allocate minimal scratch - assert(scratch.queue_base == NULL && "bad scratch data"); + assert(scratch.queue_base == nullptr && "bad scratch data"); if (core::Runtime::runtime_singleton_->flag().enable_queue_fault_message()) debug_print("Could not allocate scratch for one wave per CU.\n"); } -void GpuAgent::ReleaseQueueScratch(void* base) { - if (base == NULL) { +void GpuAgent::ReleaseQueueScratch(ScratchInfo& scratch) { + if (scratch.queue_base == nullptr) { return; } ScopedAcquire lock(&scratch_lock_); if (profile_ == HSA_PROFILE_BASE) { - if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) { + if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(scratch.queue_base)) { assert(false && "Unmap scratch subrange failed!"); } } - scratch_pool_.free(base); + scratch_pool_.free(scratch.queue_base); + + if (scratch.large) scratch_used_large_ -= scratch.size; + + // Notify waiters that additional scratch may be available. + for (auto notifier : scratch_notifiers_) + HSA::hsa_signal_or_relaxed(notifier.first, notifier.second); } void GpuAgent::TranslateTime(core::Signal* signal, @@ -1082,6 +1150,10 @@ uint16_t GpuAgent::GetMicrocodeVersion() const { return (properties_.EngineId.ui32.uCode); } +uint16_t GpuAgent::GetSdmaMicrocodeVersion() const { + return (properties_.uCodeEngineVersions.uCodeSDMA); +} + void GpuAgent::SyncClocks() { HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t1_); assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error"); diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index 99fbdd10a..39814839e 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -136,8 +136,7 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owne (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } - max_single_alloc_size_ = - AlignDown(static_cast(GetPhysicalSize()), kPageSize_); + max_single_alloc_size_ = AlignDown(static_cast(GetPhysicalSize()), kPageSize_); mem_flag_.ui32.CoarseGrain = (fine_grain) ? 0 : 1; @@ -288,16 +287,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, } break; case HSA_REGION_INFO_SIZE: - switch (mem_props_.HeapType) { - case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: - case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: - *((size_t*)value) = static_cast(GetPhysicalSize()); - break; - default: - *((size_t*)value) = static_cast( - (full_profile()) ? 
GetVirtualSize() : GetPhysicalSize()); - break; - } + *((size_t*)value) = static_cast(GetPhysicalSize()); break; case HSA_REGION_INFO_ALLOC_MAX_SIZE: switch (mem_props_.HeapType) { @@ -535,7 +525,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, lock.Release(); for (GpuAgentInt* gpu : whitelist_gpus) { - gpu->InitDma(); + gpu->PreloadBlits(); } return HSA_STATUS_SUCCESS; @@ -584,7 +574,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, if (agent->device_type() == core::Agent::kAmdGpuDevice) { whitelist_nodes.push_back(agent->node_id()); - whitelist_gpus.insert(reinterpret_cast(agent)); + whitelist_gpus.insert(agent); } } } @@ -607,8 +597,9 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, } else { *agent_ptr = host_ptr; } - for (core::Agent* gpu : whitelist_gpus) { - reinterpret_cast(gpu)->InitDma(); + + for (auto gpu : whitelist_gpus) { + static_cast(gpu)->PreloadBlits(); } return HSA_STATUS_SUCCESS; diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index 0c55e3810..c33b78f13 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -376,7 +376,7 @@ static size_t get_extension_table_length(uint16_t extension, uint16_t major, uin return 0; } - char buff[3]; + char buff[6]; sprintf(buff, "%02u", minor); name += std::to_string(major) + "_" + buff + "_pfn_t"; @@ -1821,6 +1821,71 @@ hsa_status_t hsa_code_object_destroy( CATCH; } +static std::string ConvertOldTargetNameToNew( + const std::string &OldName, bool IsFinalizer, uint32_t EFlags) { + std::string NewName = ""; + + // FIXME #1: Should 9:0:3 be completely (loader, sc, etc.) removed? + // FIXME #2: What does PAL do with respect to boltzmann/usual fiji/tonga? + if (OldName == "AMD:AMDGPU:7:0:0") + NewName = "amdgcn-amd-amdhsa--gfx700"; + else if (OldName == "AMD:AMDGPU:7:0:1") + NewName = "amdgcn-amd-amdhsa--gfx701"; + else if (OldName == "AMD:AMDGPU:7:0:2") + NewName = "amdgcn-amd-amdhsa--gfx702"; + else if (OldName == "AMD:AMDGPU:7:0:3") + NewName = "amdgcn-amd-amdhsa--gfx703"; + else if (OldName == "AMD:AMDGPU:7:0:4") + NewName = "amdgcn-amd-amdhsa--gfx704"; + else if (OldName == "AMD:AMDGPU:8:0:0") + NewName = "amdgcn-amd-amdhsa--gfx800"; + else if (OldName == "AMD:AMDGPU:8:0:1") + NewName = "amdgcn-amd-amdhsa--gfx801"; + else if (OldName == "AMD:AMDGPU:8:0:2") + NewName = "amdgcn-amd-amdhsa--gfx802"; + else if (OldName == "AMD:AMDGPU:8:0:3") + NewName = "amdgcn-amd-amdhsa--gfx803"; + else if (OldName == "AMD:AMDGPU:8:0:4") + NewName = "amdgcn-amd-amdhsa--gfx804"; + else if (OldName == "AMD:AMDGPU:8:1:0") + NewName = "amdgcn-amd-amdhsa--gfx810"; + else if (OldName == "AMD:AMDGPU:9:0:0") + NewName = "amdgcn-amd-amdhsa--gfx900"; + else if (OldName == "AMD:AMDGPU:9:0:1") + NewName = "amdgcn-amd-amdhsa--gfx900"; + else if (OldName == "AMD:AMDGPU:9:0:2") + NewName = "amdgcn-amd-amdhsa--gfx902"; + else if (OldName == "AMD:AMDGPU:9:0:3") + NewName = "amdgcn-amd-amdhsa--gfx902"; + else if (OldName == "AMD:AMDGPU:9:0:4") + NewName = "amdgcn-amd-amdhsa--gfx904"; + else if (OldName == "AMD:AMDGPU:9:0:6") + NewName = "amdgcn-amd-amdhsa--gfx906"; + else + assert(false && "Unhandled target"); + + if (IsFinalizer && (EFlags & EF_AMDGPU_XNACK)) { + NewName = NewName + "+xnack"; + } else { + if (EFlags != 0 && (EFlags & EF_AMDGPU_XNACK_LC)) { + NewName = NewName + "+xnack"; + } else { + if (OldName == "AMD:AMDGPU:8:0:1") + NewName = NewName + "+xnack"; + else if (OldName == "AMD:AMDGPU:8:1:0") + NewName = NewName + "+xnack"; + else if (OldName 
== "AMD:AMDGPU:9:0:1") + NewName = NewName + "+xnack"; + else if (OldName == "AMD:AMDGPU:9:0:2") + NewName = NewName + "+xnack"; + else if (OldName == "AMD:AMDGPU:9:0:3") + NewName = NewName + "+xnack"; + } + } + + return NewName; +} + /* deprecated */ hsa_status_t hsa_code_object_get_info( hsa_code_object_t code_object, @@ -1843,8 +1908,26 @@ hsa_status_t hsa_code_object_get_info( return status; } + std::string isa_name_str(isa_name); + + bool IsFinalizer = true; + uint32_t codeHsailMajor; + uint32_t codeHsailMinor; + hsa_profile_t codeProfile; + hsa_machine_model_t codeMachineModel; + hsa_default_float_rounding_mode_t codeRoundingMode; + if (!code->GetNoteHsail(&codeHsailMajor, &codeHsailMinor, + &codeProfile, &codeMachineModel, + &codeRoundingMode)) { + // Only finalizer generated the "HSAIL" note. + IsFinalizer = false; + } + + std::string new_isa_name_str = + ConvertOldTargetNameToNew(isa_name_str, IsFinalizer, code->EFlags()); + hsa_isa_t isa_handle = {0}; - status = HSA::hsa_isa_from_name(isa_name, &isa_handle); + status = HSA::hsa_isa_from_name(new_isa_name_str.c_str(), &isa_handle); if (status != HSA_STATUS_SUCCESS) { return status; } @@ -2593,22 +2676,26 @@ hsa_status_t hsa_status_string( break; case HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER: *status_string = - "HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER: *The code object reader is invalid."; + "HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER: The code object reader is invalid."; break; case HSA_STATUS_ERROR_INVALID_CACHE: - *status_string = "HSA_STATUS_ERROR_INVALID_CACHE: *The cache is invalid."; + *status_string = "HSA_STATUS_ERROR_INVALID_CACHE: The cache is invalid."; break; case HSA_STATUS_ERROR_INVALID_WAVEFRONT: - *status_string = "HSA_STATUS_ERROR_INVALID_WAVEFRONT: *The wavefront is invalid."; + *status_string = "HSA_STATUS_ERROR_INVALID_WAVEFRONT: The wavefront is invalid."; break; case HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP: - *status_string = "HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP: *The signal group is invalid."; + *status_string = "HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP: The signal group is invalid."; break; case HSA_STATUS_ERROR_INVALID_RUNTIME_STATE: *status_string = - "HSA_STATUS_ERROR_INVALID_RUNTIME_STATE: *The HSA runtime is not in the configuration " + "HSA_STATUS_ERROR_INVALID_RUNTIME_STATE: The HSA runtime is not in the configuration " "state."; break; + case HSA_STATUS_ERROR_FATAL: + *status_string = + "HSA_STATUS_ERROR_FATAL: The queue received an error that may require process " + "termination."; case HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED: *status_string = "HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED: Image " diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index 9988dd359..0a70f0308 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -333,6 +333,16 @@ hsa_status_t hsa_amd_profiling_get_async_copy_time( return HSA_STATUS_ERROR; } + // Validate the embedded agent pointer if the signal is IPC. + if (signal->isIPC()) { + for (auto it : core::Runtime::runtime_singleton_->gpu_agents()) { + if (it == agent) break; + } + // If the agent isn't a GPU then it is from a different process or it's a CPU. + // Assume it's a CPU and illegal uses will generate garbage data same as kernel completion. + agent = core::Runtime::runtime_singleton_->cpu_agents()[0]; + } + if (agent->device_type() == core::Agent::DeviceType::kAmdGpuDevice) { // Translate timestamp from GPU to system domain. 
static_cast(agent)->TranslateTime(signal, *time); diff --git a/src/core/runtime/intercept_queue.cpp b/src/core/runtime/intercept_queue.cpp index 0beaddab0..4761a8f4f 100644 --- a/src/core/runtime/intercept_queue.cpp +++ b/src/core/runtime/intercept_queue.cpp @@ -70,7 +70,7 @@ int InterceptQueue::rtti_id_ = 0; InterceptQueue::InterceptQueue(std::unique_ptr queue) : QueueProxy(std::move(queue)), LocalSignal(0), - Signal(signal()), + DoorbellSignal(signal()), next_packet_(0), retry_index_(0), quit_(false), diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index a4a15fa03..431ed6ebf 100644 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -67,9 +67,13 @@ bool Wavefront::GetInfo( std::string Isa::GetFullName() const { std::stringstream full_name; - full_name << GetVendor() << ":" << GetArchitecture() << ":" - << GetMajorVersion() << ":" << GetMinorVersion() << ":" - << GetStepping(); + full_name << GetArchitecture() << "-" << GetVendor() << "-" << GetOS() << "-" + << GetEnvironment() << "-gfx" << GetMajorVersion() + << GetMinorVersion() << GetStepping(); + + if (xnackEnabled_) + full_name << "+xnack"; + return full_name.str(); } @@ -176,8 +180,8 @@ const Isa *IsaRegistry::GetIsa(const std::string &full_name) { return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; } -const Isa *IsaRegistry::GetIsa(const Isa::Version &version) { - auto isareg_iter = supported_isas_.find(Isa(version).GetFullName()); +const Isa *IsaRegistry::GetIsa(const Isa::Version &version, bool xnack) { + auto isareg_iter = supported_isas_.find(Isa(version, xnack).GetFullName()); return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; } @@ -185,25 +189,27 @@ const IsaRegistry::IsaMap IsaRegistry::supported_isas_ = IsaRegistry::GetSupportedIsas(); const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() { -#define ISAREG_ENTRY_GEN(maj, min, stp) \ - Isa amd_amdgpu_##maj##min##stp; \ - amd_amdgpu_##maj##min##stp.version_ = Isa::Version(maj, min, stp); \ - supported_isas.insert( \ - std::make_pair( \ - amd_amdgpu_##maj##min##stp.GetFullName(), amd_amdgpu_##maj##min##stp)); \ +#define ISAREG_ENTRY_GEN(maj, min, stp, xnack) \ + Isa amd_amdgpu_##maj##min##stp##xnack; \ + amd_amdgpu_##maj##min##stp##xnack.version_ = Isa::Version(maj, min, stp); \ + amd_amdgpu_##maj##min##stp##xnack.xnackEnabled_ = xnack; \ + supported_isas.insert(std::make_pair( \ + amd_amdgpu_##maj##min##stp##xnack.GetFullName(), \ + amd_amdgpu_##maj##min##stp##xnack)); \ IsaMap supported_isas; - ISAREG_ENTRY_GEN(7, 0, 0) - ISAREG_ENTRY_GEN(7, 0, 1) - ISAREG_ENTRY_GEN(7, 0, 2) - ISAREG_ENTRY_GEN(8, 0, 1) - ISAREG_ENTRY_GEN(8, 0, 2) - ISAREG_ENTRY_GEN(8, 0, 3) - ISAREG_ENTRY_GEN(9, 0, 0) - ISAREG_ENTRY_GEN(9, 0, 1) - ISAREG_ENTRY_GEN(9, 0, 2) - ISAREG_ENTRY_GEN(9, 0, 3) + ISAREG_ENTRY_GEN(7, 0, 0, false) + ISAREG_ENTRY_GEN(7, 0, 1, false) + ISAREG_ENTRY_GEN(7, 0, 2, false) + ISAREG_ENTRY_GEN(8, 0, 1, true) + ISAREG_ENTRY_GEN(8, 0, 2, false) + ISAREG_ENTRY_GEN(8, 0, 3, false) + ISAREG_ENTRY_GEN(8, 1, 0, true) + ISAREG_ENTRY_GEN(9, 0, 0, false) + ISAREG_ENTRY_GEN(9, 0, 2, true) + ISAREG_ENTRY_GEN(9, 0, 4, false) + ISAREG_ENTRY_GEN(9, 0, 6, false) return supported_isas; } diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 497d9b85c..0625c6f5f 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -108,16 +108,17 @@ hsa_status_t Runtime::Acquire() { } runtime_singleton_->ref_count_++; + MAKE_NAMED_SCOPE_GUARD(refGuard, [&]() { 
runtime_singleton_->ref_count_--; }); if (runtime_singleton_->ref_count_ == 1) { hsa_status_t status = runtime_singleton_->Load(); if (status != HSA_STATUS_SUCCESS) { - runtime_singleton_->ref_count_--; return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } } + refGuard.Dismiss(); return HSA_STATUS_SUCCESS; } @@ -1101,15 +1102,69 @@ bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { reason += "Unknown"; } - fprintf(stderr, - "Memory access fault by GPU node-%u on address %p%s. Reason: %s.\n", - fault.NodeId, reinterpret_cast(fault.VirtualAddress), - (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", - reason.c_str()); - } else { - assert(false && "GPU memory access fault."); + core::Agent* faultingAgent = runtime_singleton_->agents_by_node_[fault.NodeId][0]; + + fprintf( + stderr, + "Memory access fault by GPU node-%u (Agent handle: %p) on address %p%s. Reason: %s.\n", + fault.NodeId, reinterpret_cast(faultingAgent->public_handle().handle), + reinterpret_cast(fault.VirtualAddress), + (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", reason.c_str()); + +#ifndef NDEBUG + runtime_singleton_->memory_lock_.Acquire(); + auto it = runtime_singleton_->allocation_map_.upper_bound( + reinterpret_cast(fault.VirtualAddress)); + for (int i = 0; i < 2; i++) { + if (it != runtime_singleton_->allocation_map_.begin()) it--; + } + fprintf(stderr, "Nearby memory map:\n"); + auto start = it; + for (int i = 0; i < 3; i++) { + if (it == runtime_singleton_->allocation_map_.end()) break; + std::string kind = "Non-HSA"; + if (it->second.region != nullptr) { + const amd::MemoryRegion* region = + static_cast(it->second.region); + if (region->IsSystem()) + kind = "System"; + else if (region->IsLocalMemory()) + kind = "VRAM"; + else if (region->IsScratch()) + kind = "Scratch"; + else if (region->IsLDS()) + kind = "LDS"; + } + fprintf(stderr, "%p, 0x%lx, %s\n", it->first, it->second.size, kind.c_str()); + it++; + } + fprintf(stderr, "\n"); + it = start; + runtime_singleton_->memory_lock_.Release(); + hsa_amd_pointer_info_t info; + PtrInfoBlockData block; + uint32_t count; + hsa_agent_t* canAccess; + info.size = sizeof(info); + for (int i = 0; i < 3; i++) { + if (it == runtime_singleton_->allocation_map_.end()) break; + runtime_singleton_->PtrInfo(const_cast(it->first), &info, malloc, &count, &canAccess, + &block); + fprintf(stderr, + "PtrInfo:\n\tAddress: %p-%p/%p-%p\n\tSize: 0x%lx\n\tType: %u\n\tOwner: %p\n", + info.agentBaseAddress, (char*)info.agentBaseAddress + info.sizeInBytes, + info.hostBaseAddress, (char*)info.hostBaseAddress + info.sizeInBytes, + info.sizeInBytes, info.type, reinterpret_cast(info.agentOwner.handle)); + fprintf(stderr, "\tCanAccess: %u\n", count); + for (int t = 0; t < count; t++) + fprintf(stderr, "\t\t%p\n", reinterpret_cast(canAccess[t].handle)); + fprintf(stderr, "\tIn block: %p, 0x%lx\n", block.base, block.length); + free(canAccess); + it++; + } +#endif //! NDEBUG } - + assert(false && "GPU memory access fault."); std::abort(); } // No need to keep the signal because we are done. 
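Runtime::Acquire and GpuAgent::QueueCreate now rely on named scope guards to roll back partially applied state on early returns. A minimal standalone sketch of that pattern (simplified types, not the runtime's utility code):

#include <functional>

struct ScopeGuard {
  explicit ScopeGuard(std::function<void()> f) : rollback(std::move(f)) {}
  ~ScopeGuard() { if (active) rollback(); }
  void Dismiss() { active = false; }
 private:
  std::function<void()> rollback;
  bool active = true;
};

// Mirrors the refGuard usage above: the increment survives only if loading
// succeeds; every failure path restores the reference count automatically.
bool AcquireLike(int& ref_count, bool load_ok) {
  ref_count++;
  ScopeGuard guard([&]() { ref_count--; });
  if (!load_ok) return false;   // guard fires here and undoes the increment
  guard.Dismiss();              // success: keep the increment
  return true;
}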
@@ -1167,6 +1222,9 @@ void Runtime::Unload() { amd::hsa::loader::Loader::Destroy(loader_); loader_ = nullptr; + std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject()); + gpu_agents_.clear(); + async_events_control_.Shutdown(); if (vm_fault_signal_ != nullptr) { diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 1de3b15ab..7593bf36e 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -69,8 +69,7 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_INTERRUPT"); enable_interrupt_ = (var == "0") ? false : true; - var = os::GetEnvVar("HSA_ENABLE_SDMA"); - enable_sdma_ = (var == "0") ? false : true; + enable_sdma_ = os::GetEnvVar("HSA_ENABLE_SDMA"); var = os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND"); running_valgrind_ = (var == "1") ? true : false; @@ -91,17 +90,20 @@ class Flag { var = os::GetEnvVar("HSA_DISABLE_FRAGMENT_ALLOCATOR"); disable_fragment_alloc_ = (var == "1") ? true : false; + + var = os::GetEnvVar("HSA_ENABLE_SDMA_HDP_FLUSH"); + enable_sdma_hdp_flush_ = (var == "0") ? false : true; } bool check_flat_scratch() const { return check_flat_scratch_; } bool enable_vm_fault_message() const { return enable_vm_fault_message_; } - + bool enable_queue_fault_message() const { return enable_queue_fault_message_; } bool enable_interrupt() const { return enable_interrupt_; } - bool enable_sdma() const { return enable_sdma_; } + bool enable_sdma_hdp_flush() const { return enable_sdma_hdp_flush_; } bool running_valgrind() const { return running_valgrind_; } @@ -111,6 +113,8 @@ class Flag { bool disable_fragment_alloc() const { return disable_fragment_alloc_; } + std::string enable_sdma() const { return enable_sdma_; } + uint32_t max_queues() const { return max_queues_; } size_t scratch_mem_size() const { return scratch_mem_size_; } @@ -121,13 +125,15 @@ class Flag { bool check_flat_scratch_; bool enable_vm_fault_message_; bool enable_interrupt_; - bool enable_sdma_; + bool enable_sdma_hdp_flush_; bool running_valgrind_; bool sdma_wait_idle_; bool enable_queue_fault_message_; bool report_tool_load_failures_; bool disable_fragment_alloc_; + std::string enable_sdma_; + uint32_t max_queues_; size_t scratch_mem_size_; diff --git a/src/core/util/lazy_ptr.h b/src/core/util/lazy_ptr.h new file mode 100644 index 000000000..7837200d8 --- /dev/null +++ b/src/core/util/lazy_ptr.h @@ -0,0 +1,125 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIESd OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ +#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ + +#include +#include +#include + +#include "core/util/utils.h" + +/* + * Wrapper for a std::unique_ptr that initializes its object at first use. + */ +template class lazy_ptr { + public: + lazy_ptr() {} + + explicit lazy_ptr(std::function Constructor) { Init(Constructor); } + + void reset(std::function Constructor = nullptr) { + obj.reset(); + func = Constructor; + } + + void reset(T* ptr) { + obj.reset(ptr); + func = nullptr; + } + + bool operator==(T* rhs) const { return obj.get() == rhs; } + bool operator!=(T* rhs) const { return obj.get() != rhs; } + + const std::unique_ptr& operator->() const { + make(true); + return obj; + } + + std::unique_ptr& operator*() { + make(true); + return obj; + } + + const std::unique_ptr& operator*() const { + make(true); + return obj; + } + + /* + * Ensures that the object is created or is being created. + * This is useful when early consruction of the object is required. + */ + void touch() const { make(false); } + + private: + mutable std::unique_ptr obj; + mutable std::function func; + mutable KernelMutex lock; + + // Separated from make to improve inlining. + void make_body(bool block) const { + if (block) { + lock.Acquire(); + } else if (!lock.Try()) { + return; + } + MAKE_SCOPE_GUARD([&]() { lock.Release(); }); + if (obj != nullptr) return; + T* ptr = func(); + std::atomic_thread_fence(std::memory_order_release); + obj.reset(ptr); + func = nullptr; + } + + __forceinline void make(bool block) const { + std::atomic_thread_fence(std::memory_order_acquire); + if (obj == nullptr) { + make_body(block); + } + } + + DISALLOW_COPY_AND_ASSIGN(lazy_ptr); +}; + +#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ diff --git a/src/core/util/small_heap.cpp b/src/core/util/small_heap.cpp index 9b8998bf3..6cd8e117d 100644 --- a/src/core/util/small_heap.cpp +++ b/src/core/util/small_heap.cpp @@ -42,25 +42,47 @@ #include "small_heap.h" -SmallHeap::memory_t::iterator SmallHeap::merge( - SmallHeap::memory_t::iterator& keep, - SmallHeap::memory_t::iterator& destroy) { - assert((char*)keep->first + keep->second.len == (char*)destroy->first && - "Invalid merge"); - assert(keep->second.isfree() && "Merge with allocated block"); - assert(destroy->second.isfree() && "Merge with allocated block"); - - keep->second.len += destroy->second.len; - keep->second.next_free = destroy->second.next_free; - if (!destroy->second.islastfree()) - memory[destroy->second.next_free].prior_free = keep->first; - - memory.erase(destroy); - return keep; +// Inserts node into freelist after place. +// Assumes node will not be an end of the list (list has guard nodes). 
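The new lazy_ptr<T> above defers construction to first use, which is how the agent's queues and blit engines avoid being created until something needs them. A short usage sketch (Widget and Example are stand-ins, not runtime types):

#include "core/util/lazy_ptr.h"

struct Widget {
  void Run() {}
};

void Example() {
  lazy_ptr<Widget> w;
  w.reset([]() { return new Widget(); });  // arm the factory; nothing is built yet
  w->Run();   // first dereference runs the factory under the internal lock and caches the object
  w.touch();  // alternatively, force early construction without using the result
}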
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) { + assert(place->first < node->first && "Order violation"); + assert(isfree(place->second) && "Freelist operation error."); + iterator_t next = place->second.next; + node->second.next = next; + node->second.prior = place; + place->second.next = node; + next->second.prior = node; +} + +// Removes node from freelist. +// Assumes node will not be an end of the list (list has guard nodes). +void SmallHeap::remove(SmallHeap::iterator_t node) { + assert(isfree(node->second) && "Freelist operation error."); + node->second.prior->second.next = node->second.next; + node->second.next->second.prior = node->second.prior; + setused(node->second); +} + +// Returns high if merge failed or the merged node. +SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low, + SmallHeap::memory_t::iterator high) { + assert(isfree(low->second) && "Merge with allocated block"); + assert(isfree(high->second) && "Merge with allocated block"); + + if ((char*)low->first + low->second.len != (char*)high->first) return high; + + assert(!islastfree(high->second) && "Illegal merge."); + + low->second.len += high->second.len; + low->second.next = high->second.next; + high->second.next->second.prior = low; + + memory.erase(high); + return low; } void SmallHeap::free(void* ptr) { - if (ptr == NULL) return; + if (ptr == nullptr) return; auto iterator = memory.find(ptr); @@ -70,105 +92,90 @@ void SmallHeap::free(void* ptr) { return; } - const auto start_guard = memory.find(0); - const auto end_guard = memory.find((void*)0xFFFFFFFFFFFFFFFFull); - // Return memory to total and link node into free list total_free += iterator->second.len; - if (first_free < iterator->first) { - auto before = iterator; - before--; - while (before != start_guard && !before->second.isfree()) before--; - assert(before->second.next_free > iterator->first && - "Inconsistency in small heap."); - iterator->second.prior_free = before->first; - iterator->second.next_free = before->second.next_free; - before->second.next_free = iterator->first; - if (!iterator->second.islastfree()) - memory[iterator->second.next_free].prior_free = iterator->first; - } else { - iterator->second.setfirstfree(); - iterator->second.next_free = first_free; - first_free = iterator->first; - if (!iterator->second.islastfree()) - memory[iterator->second.next_free].prior_free = iterator->first; - } - // Attempt compaction + // Could also traverse the free list which might be faster in some cases. auto before = iterator; before--; - if (before != start_guard) { - if (before->second.isfree()) { - iterator = merge(before, iterator); - } - } + while (!isfree(before->second)) before--; + assert(before->second.next->first > iterator->first && "Inconsistency in small heap."); + insertafter(before, iterator); - auto after = iterator; - after++; - if (after != end_guard) { - if (after->second.isfree()) { - iterator = merge(iterator, after); - } - } + // Attempt compaction + iterator = merge(before, iterator); + merge(iterator, iterator->second.next); + + // Update lowHighBondary + high.erase(ptr); } void* SmallHeap::alloc(size_t bytes) { // Is enough memory available? 
- if ((bytes > total_free) || (bytes == 0)) return NULL; + if ((bytes > total_free) || (bytes == 0)) return nullptr; - memory_t::iterator current; - memory_t::iterator prior; + iterator_t current; // Walk the free list and allocate at first fitting location - prior = current = memory.find(first_free); - while (true) { + current = firstfree(); + while (!islastfree(current->second)) { if (bytes <= current->second.len) { // Decrement from total total_free -= bytes; - // Is allocation an exact fit? - if (bytes == current->second.len) { - if (prior == current) { - first_free = current->second.next_free; - if (!current->second.islastfree()) - memory[current->second.next_free].setfirstfree(); - } else { - prior->second.next_free = current->second.next_free; - if (!current->second.islastfree()) - memory[current->second.next_free].prior_free = prior->first; - } - current->second.next_free = NULL; - return current->first; - } else { - // Split current node + // Split node + if (bytes != current->second.len) { void* remaining = (char*)current->first + bytes; Node& node = memory[remaining]; - node.next_free = current->second.next_free; - node.prior_free = current->second.prior_free; node.len = current->second.len - bytes; current->second.len = bytes; - - if (prior == current) { - first_free = remaining; - node.setfirstfree(); - } else { - prior->second.next_free = remaining; - node.prior_free = prior->first; - } - if (!node.islastfree()) memory[node.next_free].prior_free = remaining; - - current->second.next_free = NULL; - return current->first; + insertafter(current, memory.find(remaining)); } + + remove(current); + return current->first; } + current = current->second.next; + } + assert(current->second.len == 0 && "Freelist corruption."); - // End of free list? - if (current->second.islastfree()) break; + // Can't service the request due to fragmentation + return nullptr; +} - prior = current; - current = memory.find(current->second.next_free); +void* SmallHeap::alloc_high(size_t bytes) { + // Is enough memory available? 
+ if ((bytes > total_free) || (bytes == 0)) return nullptr; + + iterator_t current; + + // Walk the free list and allocate at first fitting location + current = lastfree(); + while (!isfirstfree(current->second)) { + if (bytes <= current->second.len) { + // Decrement from total + total_free -= bytes; + + void* alloc; + // Split node + if (bytes != current->second.len) { + alloc = (char*)current->first + current->second.len - bytes; + current->second.len -= bytes; + Node& node = memory[alloc]; + node.len = bytes; + setused(node); + } else { + alloc = current->first; + remove(current); + } + + high.insert(alloc); + return alloc; + } + current = current->second.prior; } + assert(current->second.len == 0 && "Freelist corruption."); // Can't service the request due to fragmentation - return NULL; + return nullptr; } diff --git a/src/core/util/small_heap.h b/src/core/util/small_heap.h index fecfe857f..7a366564a 100644 --- a/src/core/util/small_heap.h +++ b/src/core/util/small_heap.h @@ -50,65 +50,78 @@ #include "utils.h" #include +#include class SmallHeap { - public: - class Node { - public: + private: + struct Node; + typedef std::map memory_t; + typedef memory_t::iterator iterator_t; + + struct Node { size_t len; - void* next_free; - void* prior_free; - static const intptr_t END = -1; - - __forceinline bool isfree() const { return next_free != NULL; } - __forceinline bool islastfree() const { return intptr_t(next_free) == END; } - __forceinline bool isfirstfree() const { - return intptr_t(prior_free) == END; - } - __forceinline void setlastfree() { - *reinterpret_cast(&next_free) = END; - } - __forceinline void setfirstfree() { - *reinterpret_cast(&prior_free) = END; - } + iterator_t next; + iterator_t prior; }; - private: - SmallHeap(const SmallHeap& rhs); - SmallHeap& operator=(const SmallHeap& rhs); + SmallHeap(const SmallHeap& rhs) = delete; + SmallHeap& operator=(const SmallHeap& rhs) = delete; void* const pool; const size_t length; size_t total_free; - void* first_free; - std::map memory; - - typedef decltype(memory) memory_t; - memory_t::iterator merge(memory_t::iterator& keep, - memory_t::iterator& destroy); + memory_t memory; + std::set high; + + __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); } + __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); } + __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); } + __forceinline void setlastfree(Node& node) { node.next = memory.end(); } + __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); } + __forceinline void setused(Node& node) { node.next = memory.begin(); } + + __forceinline iterator_t firstfree() { return memory.begin()->second.next; } + __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; } + void insertafter(iterator_t place, iterator_t node); + void remove(iterator_t node); + iterator_t merge(iterator_t low, iterator_t high); public: - SmallHeap() : pool(NULL), length(0), total_free(0) {} + SmallHeap() : pool(nullptr), length(0), total_free(0) {} SmallHeap(void* base, size_t length) : pool(base), length(length), total_free(length) { - first_free = pool; + assert(pool != nullptr && "Invalid base address."); + assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address."); + assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds."); + + Node& start = memory[0]; + Node& node = memory[pool]; + Node& end = 
memory[(void*)0xFFFFFFFFFFFFFFFFull]; + + start.len = 0; + start.next = memory.find(pool); + setfirstfree(start); - Node& node = memory[first_free]; node.len = length; - node.setlastfree(); - node.setfirstfree(); + node.prior = memory.begin(); + node.next = --memory.end(); + + end.len = 0; + end.prior = start.next; + setlastfree(end); - memory[0].len = 0; - memory[(void*)0xFFFFFFFFFFFFFFFFull].len = 0; + high.insert((void*)0xFFFFFFFFFFFFFFFFull); } void* alloc(size_t bytes); + void* alloc_high(size_t bytes); void free(void* ptr); void* base() const { return pool; } size_t size() const { return length; } size_t remaining() const { return total_free; } + void* high_split() const { return *high.begin(); } }; #endif diff --git a/src/inc/amd_hsa_common.h b/src/inc/amd_hsa_common.h index ca6a2b983..bfb613ec4 100644 --- a/src/inc/amd_hsa_common.h +++ b/src/inc/amd_hsa_common.h @@ -75,8 +75,8 @@ // Creates enumeration entries for packed types. Enumeration entries include // bit shift amount, bit width, and bit mask. #define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \ - name ## _SHIFT = (shift), \ - name ## _WIDTH = (width), \ + name##_SHIFT = (shift), \ + name##_WIDTH = (width), \ name = (((1 << (width)) - 1) << (shift)) \ // Gets bits for specified mask from specified src packed instance. @@ -85,7 +85,7 @@ // Sets val bits for specified mask in specified dst packed instance. #define AMD_HSA_BITS_SET(dst, mask, val) \ - dst &= (~(1 << mask ## _SHIFT) & ~mask); \ - dst |= (((val) << mask ## _SHIFT) & mask) \ + dst &= (~(1 << mask##_SHIFT) & ~mask); \ + dst |= (((val) << mask##_SHIFT) & mask) \ #endif // AMD_HSA_COMMON_H diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index 95f89c635..e0702b175 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -52,6 +52,10 @@ #define EF_AMDGPU_XNACK 0x00000001 #define EF_AMDGPU_TRAP_HANDLER 0x00000002 +// FIXME: We really need to start thinking about separating legacy code out, +// it is getting messy. +#define EF_AMDGPU_XNACK_LC 0x100 + // ELF Section Header Flag Enumeration Values. #define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS) #define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS) @@ -116,6 +120,7 @@ typedef enum { #define R_AMDGPU_64 3 #define R_AMDGPU_INIT_SAMPLER 4 #define R_AMDGPU_INIT_IMAGE 5 +#define R_AMDGPU_RELATIVE64 13 // AMD GPU Note Type Enumeration Values. #define NT_AMDGPU_HSA_CODE_OBJECT_VERSION 1 diff --git a/src/inc/amd_hsa_queue.h b/src/inc/amd_hsa_queue.h index b37bb53f3..2176e8470 100644 --- a/src/inc/amd_hsa_queue.h +++ b/src/inc/amd_hsa_queue.h @@ -53,7 +53,8 @@ enum amd_queue_properties_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 4, 28) + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27) }; // AMD Queue. diff --git a/src/inc/hsa.h b/src/inc/hsa.h index b8ec90a66..0ed2b689e 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -116,148 +116,151 @@ extern "C" { * @brief Status codes. */ typedef enum { - /** - * The function has been executed successfully. - */ - HSA_STATUS_SUCCESS = 0x0, - /** - * A traversal over a list of elements has been interrupted by the - * application before completing. 
- */ - HSA_STATUS_INFO_BREAK = 0x1, - /** - * A generic error has occurred. - */ - HSA_STATUS_ERROR = 0x1000, - /** - * One of the actual arguments does not meet a precondition stated in the - * documentation of the corresponding formal argument. - */ - HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, - /** - * The requested queue creation is not valid. - */ - HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, - /** - * The requested allocation is not valid. - */ - HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, - /** - * The agent is invalid. - */ - HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, - /** - * The memory region is invalid. - */ - HSA_STATUS_ERROR_INVALID_REGION = 0x1005, - /** - * The signal is invalid. - */ - HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, - /** - * The queue is invalid. - */ - HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, - /** - * The HSA runtime failed to allocate the necessary resources. This error - * may also occur when the HSA runtime needs to spawn threads or create - * internal OS-specific events. - */ - HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, - /** - * The AQL packet is malformed. - */ - HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, - /** - * An error has been detected while releasing a resource. - */ - HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, - /** - * An API other than ::hsa_init has been invoked while the reference count - * of the HSA runtime is 0. - */ - HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, - /** - * The maximum reference count for the object has been reached. - */ - HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, - /** - * The arguments passed to a functions are not compatible. - */ - HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, - /** - * The index is invalid. - */ - HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, - /** - * The instruction set architecture is invalid. - */ - HSA_STATUS_ERROR_INVALID_ISA = 0x100F, - /** - * The instruction set architecture name is invalid. - */ - HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, - /** - * The code object is invalid. - */ - HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, - /** - * The executable is invalid. - */ - HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, - /** - * The executable is frozen. - */ - HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, - /** - * There is no symbol with the given name. - */ - HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, - /** - * The variable is already defined. - */ - HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, - /** - * The variable is undefined. - */ - HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, - /** - * An HSAIL operation resulted in a hardware exception. - */ - HSA_STATUS_ERROR_EXCEPTION = 0x1016, - /** - * The code object symbol is invalid. - */ - HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, - /** - * The executable symbol is invalid. - */ - HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, - /** - * The file descriptor is invalid. - */ - HSA_STATUS_ERROR_INVALID_FILE = 0x1020, - /** - * The code object reader is invalid. - */ - HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, - /** - * The cache is invalid. - */ - HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, - /** - * The wavefront is invalid. - */ - HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, - /** - * The signal group is invalid. - */ - HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, - /** - * The HSA runtime is not in the configuration state. - */ - HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025 - + /** + * The function has been executed successfully. 
+ */ + HSA_STATUS_SUCCESS = 0x0, + /** + * A traversal over a list of elements has been interrupted by the + * application before completing. + */ + HSA_STATUS_INFO_BREAK = 0x1, + /** + * A generic error has occurred. + */ + HSA_STATUS_ERROR = 0x1000, + /** + * One of the actual arguments does not meet a precondition stated in the + * documentation of the corresponding formal argument. + */ + HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, + /** + * The requested queue creation is not valid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, + /** + * The requested allocation is not valid. + */ + HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, + /** + * The agent is invalid. + */ + HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, + /** + * The memory region is invalid. + */ + HSA_STATUS_ERROR_INVALID_REGION = 0x1005, + /** + * The signal is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, + /** + * The queue is invalid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, + /** + * The HSA runtime failed to allocate the necessary resources. This error + * may also occur when the HSA runtime needs to spawn threads or create + * internal OS-specific events. + */ + HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, + /** + * The AQL packet is malformed. + */ + HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, + /** + * An error has been detected while releasing a resource. + */ + HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, + /** + * An API other than ::hsa_init has been invoked while the reference count + * of the HSA runtime is 0. + */ + HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, + /** + * The maximum reference count for the object has been reached. + */ + HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, + /** + * The arguments passed to a functions are not compatible. + */ + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, + /** + * The index is invalid. + */ + HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, + /** + * The instruction set architecture is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA = 0x100F, + /** + * The instruction set architecture name is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, + /** + * The code object is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, + /** + * The executable is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, + /** + * The executable is frozen. + */ + HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, + /** + * There is no symbol with the given name. + */ + HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, + /** + * The variable is already defined. + */ + HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, + /** + * The variable is undefined. + */ + HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, + /** + * An HSAIL operation resulted in a hardware exception. + */ + HSA_STATUS_ERROR_EXCEPTION = 0x1016, + /** + * The code object symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, + /** + * The executable symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, + /** + * The file descriptor is invalid. + */ + HSA_STATUS_ERROR_INVALID_FILE = 0x1020, + /** + * The code object reader is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, + /** + * The cache is invalid. + */ + HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, + /** + * The wavefront is invalid. + */ + HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, + /** + * The signal group is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, + /** + * The HSA runtime is not in the configuration state. 
+   */
+  HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025,
+  /**
+   * The queue received an error that may require process termination.
+   */
+  HSA_STATUS_ERROR_FATAL = 0x1026
 } hsa_status_t;
 
 /**
diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h
old mode 100755
new mode 100644
index 78db49608..b4b95e2f3
--- a/src/inc/hsa_ext_amd.h
+++ b/src/inc/hsa_ext_amd.h
@@ -299,9 +299,8 @@ hsa_status_t HSA_API
  * @brief Retrieve packet processing time stamps.
  *
  * @param[in] agent The agent with which the signal was last used. For
- *instance,
- * if the profiled dispatch packet is dispatched on to queue Q, which was
- * created on agent A, then this parameter must be A.
+ * instance, if the profiled dispatch packet is dispatched onto queue Q,
+ * which was created on agent A, then this parameter must be A.
 *
 * @param[in] signal A signal used as the completion signal of the dispatch
 * packet to retrieve time stamps from. This dispatch packet must have been
@@ -388,8 +387,10 @@ typedef enum {
   HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1,
   /**
    * Signal may be used for interprocess communication.
-   * This signal may not be used with profiling APIs. Errors or inaccurate
-   * timing data may result from such use.
+   * IPC signals can be read, written, and waited on from any process.
+   * Profiling using an IPC enabled signal is only supported in a single process
+   * at a time. Producing profiling data in one process and consuming it in
+   * another process is undefined.
    */
   HSA_AMD_SIGNAL_IPC = 2,
 } hsa_amd_signal_attribute_t;
diff --git a/src/libamdhsacode/amd_elf_image.cpp b/src/libamdhsacode/amd_elf_image.cpp
index 80c25d520..ee9ff3135 100644
--- a/src/libamdhsacode/amd_elf_image.cpp
+++ b/src/libamdhsacode/amd_elf_image.cpp
@@ -703,6 +703,7 @@ namespace amd {
     uint16_t Machine() override { return ehdr.e_machine; }
     uint16_t Type() override { return ehdr.e_type; }
+    uint32_t EFlags() override { return ehdr.e_flags; }
 
     GElfStringTable* shstrtab() override;
     GElfStringTable* strtab() override;
diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp
index 7b88e0d4a..eb3511f18 100644
--- a/src/loader/executable.cpp
+++ b/src/loader/executable.cpp
@@ -1016,6 +1016,71 @@ static uint32_t NextCodeObjectNum()
   return dumpN++;
 }
 
+static std::string ConvertOldTargetNameToNew(
+    const std::string &OldName, bool IsFinalizer, uint32_t EFlags) {
+  std::string NewName = "";
+
+  // FIXME #1: Should 9:0:3 be completely (loader, sc, etc.) removed?
+  // FIXME #2: What does PAL do with respect to boltzmann/usual fiji/tonga?
+  if (OldName == "AMD:AMDGPU:7:0:0")
+    NewName = "amdgcn-amd-amdhsa--gfx700";
+  else if (OldName == "AMD:AMDGPU:7:0:1")
+    NewName = "amdgcn-amd-amdhsa--gfx701";
+  else if (OldName == "AMD:AMDGPU:7:0:2")
+    NewName = "amdgcn-amd-amdhsa--gfx702";
+  else if (OldName == "AMD:AMDGPU:7:0:3")
+    NewName = "amdgcn-amd-amdhsa--gfx703";
+  else if (OldName == "AMD:AMDGPU:7:0:4")
+    NewName = "amdgcn-amd-amdhsa--gfx704";
+  else if (OldName == "AMD:AMDGPU:8:0:0")
+    NewName = "amdgcn-amd-amdhsa--gfx800";
+  else if (OldName == "AMD:AMDGPU:8:0:1")
+    NewName = "amdgcn-amd-amdhsa--gfx801";
+  else if (OldName == "AMD:AMDGPU:8:0:2")
+    NewName = "amdgcn-amd-amdhsa--gfx802";
+  else if (OldName == "AMD:AMDGPU:8:0:3")
+    NewName = "amdgcn-amd-amdhsa--gfx803";
+  else if (OldName == "AMD:AMDGPU:8:0:4")
+    NewName = "amdgcn-amd-amdhsa--gfx804";
+  else if (OldName == "AMD:AMDGPU:8:1:0")
+    NewName = "amdgcn-amd-amdhsa--gfx810";
+  else if (OldName == "AMD:AMDGPU:9:0:0")
+    NewName = "amdgcn-amd-amdhsa--gfx900";
+  else if (OldName == "AMD:AMDGPU:9:0:1")
+    NewName = "amdgcn-amd-amdhsa--gfx900";
+  else if (OldName == "AMD:AMDGPU:9:0:2")
+    NewName = "amdgcn-amd-amdhsa--gfx902";
+  else if (OldName == "AMD:AMDGPU:9:0:3")
+    NewName = "amdgcn-amd-amdhsa--gfx902";
+  else if (OldName == "AMD:AMDGPU:9:0:4")
+    NewName = "amdgcn-amd-amdhsa--gfx904";
+  else if (OldName == "AMD:AMDGPU:9:0:6")
+    NewName = "amdgcn-amd-amdhsa--gfx906";
+  else
+    assert(false && "Unhandled target");
+
+  if (IsFinalizer && (EFlags & EF_AMDGPU_XNACK)) {
+    NewName = NewName + "+xnack";
+  } else {
+    if (EFlags != 0 && (EFlags & EF_AMDGPU_XNACK_LC)) {
+      NewName = NewName + "+xnack";
+    } else {
+      if (OldName == "AMD:AMDGPU:8:0:1")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:8:1:0")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:9:0:1")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:9:0:2")
+        NewName = NewName + "+xnack";
+      else if (OldName == "AMD:AMDGPU:9:0:3")
+        NewName = NewName + "+xnack";
    }
  }
+
+  return NewName;
+}
+
 hsa_status_t ExecutableImpl::LoadCodeObject(
   hsa_agent_t agent,
   hsa_code_object_t code_object,
@@ -1110,9 +1175,6 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
   std::string codeIsa;
   if (!code->GetNoteIsa(codeIsa)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; }
 
-  hsa_isa_t objectsIsa = context_->IsaFromName(codeIsa.c_str());
-  if (!objectsIsa.handle) { return HSA_STATUS_ERROR_INVALID_ISA_NAME; }
-
   uint32_t majorVersion, minorVersion;
   if (!code->GetNoteCodeObjectVersion(&majorVersion, &minorVersion)) {
     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
@@ -1120,20 +1182,30 @@ hsa_status_t ExecutableImpl::LoadCodeObject(
   if (majorVersion != 1 && majorVersion != 2) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; }
   if (agent.handle == 0 && majorVersion == 1) { return HSA_STATUS_ERROR_INVALID_AGENT; }
-  if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa)) { return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; }
+  bool IsFinalizer = true;
 
   uint32_t codeHsailMajor;
   uint32_t codeHsailMinor;
   hsa_profile_t codeProfile;
   hsa_machine_model_t codeMachineModel;
   hsa_default_float_rounding_mode_t codeRoundingMode;
   if (!code->GetNoteHsail(&codeHsailMajor, &codeHsailMinor, &codeProfile, &codeMachineModel, &codeRoundingMode)) {
+    // Only finalizer generated the "HSAIL" note.
+    IsFinalizer = false;
     codeProfile = HSA_PROFILE_FULL;
   }
   if (profile_ != codeProfile) { return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; }
 
+  codeIsa = ConvertOldTargetNameToNew(codeIsa, IsFinalizer, code->EFlags());
+  hsa_isa_t objectsIsa = context_->IsaFromName(codeIsa.c_str());
+  if (!objectsIsa.handle) { return HSA_STATUS_ERROR_INVALID_ISA_NAME; }
+
+  if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa)) {
+    return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+  }
+
   hsa_status_t status;
 
   objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize()));
@@ -1673,14 +1745,16 @@ hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa:
       break;
     }
 
-    case R_AMDGPU_INIT_IMAGE:
-    case R_AMDGPU_INIT_SAMPLER:
-      // Images and samplers are not supported in v2.1.
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    case R_AMDGPU_RELATIVE64:
+    {
+      int64_t baseDelta = reinterpret_cast<int64_t>(relSeg->Address(0)) - relSeg->VAddr();
+      uint64_t relocatedAddr = baseDelta + rel->addend();
+      relSeg->Copy(rel->offset(), &relocatedAddr, sizeof(relocatedAddr));
+      break;
+    }
     default:
-      // Ignore.
-      break;
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
   }
   return HSA_STATUS_SUCCESS;
 }
diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp
old mode 100755
new mode 100644
index 7a795acce..624a8c894
--- a/src/loader/loaders.cpp
+++ b/src/loader/loaders.cpp
@@ -87,22 +87,16 @@ namespace loader {
     gfx803.handle = 803;
     gfx804.handle = 804;
     gfx810.handle = 810;
-#if defined(GFX9_BUILD)
     gfx900.handle = 900;
     gfx901.handle = 901;
-    gfx901.handle = 902;
+    gfx902.handle = 902;
     gfx903.handle = 903;
-    gfx903.handle = 904;
-    gfx903.handle = 905;
-    gfx903.handle = 906;
-    gfx907.handle = 907;
-#endif // GFX9_BUILD
   }
 
   hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
   {
     std::string sname(name);
-    if (sname == "AMD:AMDGPU:7:0:0") {
+    if (sname == "AMD:AMDGPU:7:0:0") {
       return gfx700;
     } else if (sname == "AMD:AMDGPU:7:0:1") {
       return gfx701;
@@ -118,7 +112,6 @@ namespace loader {
       return gfx804;
     } else if (sname == "AMD:AMDGPU:8:1:0") {
       return gfx810;
-#if defined(GFX9_BUILD)
     } else if (sname == "AMD:AMDGPU:9:0:0") {
       return gfx900;
     } else if (sname == "AMD:AMDGPU:9:0:1") {
@@ -127,19 +120,10 @@ namespace loader {
       return gfx902;
     } else if (sname == "AMD:AMDGPU:9:0:3") {
       return gfx903;
-    } else if (sname == "AMD:AMDGPU:9:0:4") {
-      return gfx904;
-    } else if (sname == "AMD:AMDGPU:9:0:5") {
-      return gfx905;
-    } else if (sname == "AMD:AMDGPU:9:0:6") {
-      return gfx906;
-    } else if (sname == "AMD:AMDGPU:9:0:7") {
-      return gfx907;
-#endif // GFX_BUILD
-    } else {
-      assert(0);
-      return invalid;
     }
+
+    assert(0);
+    return invalid;
   }
 
   bool OfflineLoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp
index 81e13fbc6..94b3ceca3 100644
--- a/src/loader/loaders.hpp
+++ b/src/loader/loaders.hpp
@@ -55,11 +55,7 @@ namespace loader {
   private:
     hsa_isa_t invalid;
     hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810;
-#if defined(GFX9_BUILD)
-    hsa_isa_t gfx900, gfx901;
-#else
-    hsa_isa_t reserved;
-#endif // GFX9_BUILD
+    hsa_isa_t gfx900, gfx901, gfx902, gfx903;
    std::ostream& out;
    typedef std::set<void*> PointerSet;
    PointerSet pointers;
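The small_heap.h changes earlier in this patch add a second allocation direction (alloc_high) and expose the boundary between the two regions through high_split(). The sketch below shows how those calls would be driven; the constructor signature (base pointer plus length) and the placement comments are assumptions of this example, not something the hunk itself guarantees.

#include <cstdlib>
#include "core/util/small_heap.h"  // assumed include path

void small_heap_example() {
  const size_t kPoolSize = 1 << 20;        // 1 MiB backing allocation
  void* backing = std::malloc(kPoolSize);
  SmallHeap heap(backing, kPoolSize);      // assumed ctor: (base, length)

  void* lo = heap.alloc(4096);             // sub-allocated from the low end
  void* hi = heap.alloc_high(4096);        // sub-allocated from the high end

  // high_split() reports the start of the high-end region: blocks from
  // alloc() are expected below it, blocks from alloc_high() at or above it.
  void* split = heap.high_split();
  (void)lo; (void)hi; (void)split;

  heap.free(lo);
  heap.free(hi);
  std::free(backing);                      // heap does not own the backing store
}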
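The AMD_HSA_BITS_* helpers touched in the amd_hsa_common.h hunk are what drive the new AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE bit added to amd_hsa_queue.h. A small illustration of how the generated _SHIFT/_WIDTH/mask values are consumed follows; only the macros and the enum entry come from the patch, while the helper functions, the include path, and the use of the amd_queue_t::queue_properties field are this example's assumptions.

#include <cstdint>
#include "inc/amd_hsa_queue.h"  // assumed include path; pulls in amd_hsa_common.h

// AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1)
// yields ..._SHIFT = 4, ..._WIDTH = 1 and a mask of ((1 << 1) - 1) << 4 = 0x10.
static void set_use_scratch_once(amd_queue_t* queue) {
  uint32_t props = queue->queue_properties;
  AMD_HSA_BITS_SET(props, AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 1);  // set the bit
  queue->queue_properties = props;
}

static bool uses_scratch_once(const amd_queue_t* queue) {
  return AMD_HSA_BITS_GET(queue->queue_properties,
                          AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE) != 0;
}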
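hsa.h gains HSA_STATUS_ERROR_FATAL for queue errors that may require process termination. The sketch below shows a queue error callback reacting to it; the callback signature is the one accepted by hsa_queue_create(), while terminating via abort() is simply this example's policy, not something the patch mandates.

#include <cstdio>
#include <cstdlib>
#include "inc/hsa.h"  // assumed include path

static void queue_error_cb(hsa_status_t status, hsa_queue_t* queue, void* data) {
  if (status == HSA_STATUS_ERROR_FATAL) {
    // The new status code signals an error that may require process termination.
    std::fprintf(stderr, "Fatal error reported on queue %p, aborting.\n", (void*)queue);
    std::abort();
  }
  std::fprintf(stderr, "Queue %p reported status 0x%x\n", (void*)queue, status);
}

// Usage: install the callback at queue creation time.
// hsa_queue_create(agent, size, HSA_QUEUE_TYPE_MULTI, queue_error_cb,
//                  /*data=*/nullptr, UINT32_MAX, UINT32_MAX, &queue);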
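The hsa_ext_amd.h hunk tightens the wording around HSA_AMD_SIGNAL_IPC. For reference, this is how such a signal is created with hsa_amd_signal_create(); the wrapper function is illustrative only, and the profiling caveat from the updated comment still applies (profile with it from one process at a time).

#include "inc/hsa.h"
#include "inc/hsa_ext_amd.h"  // assumed include paths

hsa_status_t create_ipc_signal(hsa_signal_t* out) {
  // No consumer list: any agent may wait on it, and with HSA_AMD_SIGNAL_IPC
  // the signal can also be shared with other processes.
  return hsa_amd_signal_create(/*initial_value=*/1, /*num_consumers=*/0,
                               /*consumers=*/nullptr, HSA_AMD_SIGNAL_IPC, out);
}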
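The new R_AMDGPU_RELATIVE64 case in ApplyDynamicRelocation writes the segment's load bias plus the relocation addend at the relocation site. The generic sketch below restates that arithmetic outside the loader's Segment interface; the function and parameter names are illustrative and not part of the patch.

#include <cstdint>
#include <cstring>

static void apply_relative64(void* load_base,     // where the segment actually loaded
                             uint64_t link_vaddr, // its link-time virtual address
                             void* reloc_site,    // address of the 64-bit word to patch
                             int64_t addend) {    // relocation addend
  // Load bias: actual load address minus link-time virtual address.
  int64_t base_delta =
      reinterpret_cast<int64_t>(load_base) - static_cast<int64_t>(link_vaddr);
  uint64_t relocated = static_cast<uint64_t>(base_delta + addend);
  std::memcpy(reloc_site, &relocated, sizeof(relocated));  // patch the word in place
}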