diff --git a/cmake/ROCclr.cmake b/cmake/ROCclr.cmake index 4d8f6a25..c7795ba8 100644 --- a/cmake/ROCclr.cmake +++ b/cmake/ROCclr.cmake @@ -100,7 +100,6 @@ endif() target_compile_definitions(rocclr PUBLIC LITTLEENDIAN_CPU - WITH_LIQUID_FLASH=0 ${AMD_OPENCL_DEFS}) target_include_directories(rocclr PUBLIC diff --git a/device/blit.cpp b/device/blit.cpp index 3c6b5dbf..a2c25376 100644 --- a/device/blit.cpp +++ b/device/blit.cpp @@ -729,14 +729,16 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz std::vector& packed_info) { // 1. Validate input arguments - guarantee(fill_size >= pattern_size, "Pattern Size cannot be greater than fill size"); - guarantee(fill_size <= memory.size(), "Cannot fill more than the mem object size"); + guarantee(fill_size >= pattern_size, "Pattern Size: %zu cannot be greater than fill size: %zu\n", + pattern_size, fill_size); + guarantee(fill_size <= memory.size(), "Cannot fill: %zu more than the mem object size: %zu\n", + fill_size, memory.size()); // 2. Calculate the next closest dword aligned address for faster processing size_t dst_addr = memory.virtualAddress() + fill_origin; size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t)); - guarantee(aligned_dst_addr >= dst_addr, "Aligned address cannot be greater than destination" - "address"); + guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %zu cannot be greater than destination " + "address: %zu\n", aligned_dst_addr, dst_addr); // 3. If given address is not aligned calculate head and tail size. size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size); diff --git a/device/devhcprintf.cpp b/device/devhcprintf.cpp index b36f7758..a5aba4a4 100644 --- a/device/devhcprintf.cpp +++ b/device/devhcprintf.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/device/devhostcall.cpp b/device/devhostcall.cpp index 8226e3da..1729e5eb 100644 --- a/device/devhostcall.cpp +++ b/device/devhostcall.cpp @@ -84,10 +84,11 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* if (!messages.handlePayload(service, payload)) { ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: invalid request for service \"%d\".", service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall: invalid service request."); + guarantee(false, "Hostcall: invalid service request %d \n", service); } return; case SERVICE_DEVMEM: { + guarantee(payload[0] != 0 || payload[1] != 0, "Both payloads cannot be 0 \n"); if (payload[0]) { amd::Memory* mem = amd::MemObjMap::FindMemObj(reinterpret_cast(payload[0])); if (mem) { @@ -114,9 +115,7 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* return; } default: - ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: no handler found for service ID \"%d\".", - service); - amd::report_fatal(__FILE__, __LINE__, "Hostcall service not supported."); + guarantee(false, "Hostcall: no handler found for service ID %d \n", service); return; } } diff --git a/device/device.cpp b/device/device.cpp index 2709febd..f87452db 100644 --- a/device/device.cpp +++ b/device/device.cpp @@ -289,11 +289,8 @@ void MemObjMap::AddMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = MemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Memobj map does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "Memobj map does not have ptr"); - } + guarantee(rval == 1, "Memobj map does not have 
ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindMemObj(const void* k, size_t* offset) { @@ -328,11 +325,8 @@ void MemObjMap::AddVirtualMemObj(const void* k, amd::Memory* v) { void MemObjMap::RemoveVirtualMemObj(const void* k) { amd::ScopedLock lock(AllocatedLock_); auto rval = VirtualMemObjMap_.erase(reinterpret_cast(k)); - if (rval != 1) { - DevLogPrintfError("Virtual Memobj map does not have ptr: 0x%x", - reinterpret_cast(k)); - guarantee(false, "VirtualMemobj map does not have ptr"); - } + guarantee(rval == 1, "Virtual Memobj map does not have ptr: 0x%x", + reinterpret_cast(k)); } amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) { diff --git a/device/device.hpp b/device/device.hpp index 64d67465..3e65a331 100644 --- a/device/device.hpp +++ b/device/device.hpp @@ -89,7 +89,6 @@ class SvmFillMemoryCommand; class SvmMapMemoryCommand; class SvmUnmapMemoryCommand; class SvmPrefetchAsyncCommand; -class TransferBufferFileCommand; class StreamOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; @@ -154,7 +153,6 @@ enum OclExtensions { ClKhrD3d9Sharing, #endif ClKhrImage2dFromBuffer, - ClAmdSemaphore, ClAMDBusAddressableMemory, ClAMDC11Atomics, ClKhrSpir, @@ -163,8 +161,6 @@ enum OclExtensions { ClKhrDepthImages, ClKhrMipMapImage, ClKhrMipMapImageWrites, - ClKhrIlProgram, - ClAMDLiquidFlash, ClAmdCopyBufferP2P, ClAmdAssemblyProgram, #if defined(_WIN32) @@ -200,7 +196,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_dx9_media_sharing ", #endif "cl_khr_image2d_from_buffer ", - "", "cl_amd_bus_addressable_memory ", "cl_amd_c11_atomics ", "cl_khr_spir ", @@ -209,8 +204,6 @@ static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ", "cl_khr_depth_images ", "cl_khr_mipmap_image ", "cl_khr_mipmap_image_writes ", - "", - "cl_amd_liquid_flash ", "cl_amd_copy_buffer_p2p ", "cl_amd_assembly_program ", #if defined(_WIN32) @@ -1241,9 +1234,6 @@ class VirtualDevice : public amd::HeapObject { /// Optional extensions virtual void submitSignal(amd::SignalCommand& cmd) = 0; virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0; - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - ShouldNotReachHere(); - } virtual void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { ShouldNotReachHere(); } @@ -1270,6 +1260,9 @@ class VirtualDevice : public amd::HeapObject { //! Returns fence state of the VirtualGPU virtual bool isFenceDirty() const = 0; + //! Resets fence state of the VirtualGPU + virtual void resetFenceDirty() = 0; + private: //! 
Disable default copy constructor VirtualDevice& operator=(const VirtualDevice&); @@ -1796,9 +1789,14 @@ class Device : public RuntimeObject { // Returns the status of HW event, associated with amd::Event virtual bool IsHwEventReady( - const amd::Event& event, //!< AMD event for HW status validation - bool wait = false //!< If true then forces the event completion - ) const { + const amd::Event& event, //!< AMD event for HW status validation + bool wait = false) const { //!< If true then forces the event completion + return false; + }; + + // Returns the status of HW event, associated with amd::Event + virtual bool IsHwEventReadyForcedWait( + const amd::Event& event) const { //!< AMD event for HW status validation return false; }; diff --git a/device/devprogram.cpp b/device/devprogram.cpp index 5455d80c..f1a2cb2e 100644 --- a/device/devprogram.cpp +++ b/device/devprogram.cpp @@ -2977,7 +2977,7 @@ bool Program::runInitFiniKernel(kernel_kind_t kind) const { amd::HostQueue* queue = nullptr; for (const auto& i : kernels_) { - LogPrintfInfo("For Init/Fini: Kernel Name: %s", i.first.c_str()); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "For Init/Fini: Kernel Name: %s", i.first.c_str()); const auto &kernel = i.second; if ((kernel->isInitKernel() && kind == kernel_kind_t::InitKernel) || (kernel->isFiniKernel() && kind == kernel_kind_t::FiniKernel)) { diff --git a/device/devwavelimiter.cpp b/device/devwavelimiter.cpp index beec1bce..deaadd98 100644 --- a/device/devwavelimiter.cpp +++ b/device/devwavelimiter.cpp @@ -328,6 +328,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( // ================================================================================================ void WaveLimiterManager::enable(bool isSupported) { if (fixed_ > 0) { + enable_ = GPU_WAVE_LIMIT_ENABLE; return; } diff --git a/device/pal/paldevice.cpp b/device/pal/paldevice.cpp index 75cd3c3e..9adfabff 100644 --- a/device/pal/paldevice.cpp +++ b/device/pal/paldevice.cpp @@ -108,6 +108,7 @@ static constexpr PalDevice supportedPalDevices[] = { {11, 0, 1, Pal::GfxIpLevel::GfxIp11_0, "gfx1101", Pal::AsicRevision::Navi32}, {11, 0, 2, Pal::GfxIpLevel::GfxIp11_0, "gfx1102", Pal::AsicRevision::Navi33}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix1}, + {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2}, }; static std::tuple findIsa(Pal::AsicRevision asicRevision, @@ -632,7 +633,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.cooperativeGroups_ = settings().enableCoopGroups_; info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_; - if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { + if (amd::IS_HIP) { + info_.largeBar_ = false; + } else if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { info_.largeBar_ = true; ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Resizable bar enabled"); } @@ -2356,8 +2359,8 @@ void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAlloc = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAlloc = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -2369,7 +2372,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result 
= static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/pal/paldevice.hpp b/device/pal/paldevice.hpp index 5b3b4af5..0226ea40 100644 --- a/device/pal/paldevice.hpp +++ b/device/pal/paldevice.hpp @@ -629,7 +629,7 @@ class Device : public NullDevice { #endif #endif //! Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex, diff --git a/device/pal/palgpuopen.cpp b/device/pal/palgpuopen.cpp index 7c6357e9..e5f7dd2e 100644 --- a/device/pal/palgpuopen.cpp +++ b/device/pal/palgpuopen.cpp @@ -54,9 +54,7 @@ RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device) se_mask_(0), perf_counter_mem_limit_(0), perf_counter_frequency_(0), - trace_enabled_(false), - inst_tracing_enabled_(false), - perf_counters_enabled_(false) { + value_(0) { memset(&trace_, 0, sizeof(trace_)); } @@ -176,6 +174,8 @@ bool RgpCaptureMgr::Update(Pal::IPlatform* platform) { PostDeviceCreate(); } + static_vm_id_ = device_.properties().gfxipProperties.flags.supportStaticVmid; + return result; } @@ -189,12 +189,12 @@ bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients; // it may be optional for Vulkan, but we provide it anyway if available). Pal::KernelContextInfo kernelContextInfo = {}; - Pal::Result palResult = iQueue->QueryKernelContextInfo(&kernelContextInfo); // Ensure we've acquired the debug VMID (note that some platforms do not // implement this function, so don't fail the whole trace if so) *debug_vmid = kernelContextInfo.flags.hasDebugVmid; + assert((static_vm_id_ || *debug_vmid) && "Can't capture multiple queues!"); // Register the queue with the GPA session class for timed queue operation support. if (trace_.gpa_session_->RegisterTimedQueue( @@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) { // continue until we find the right queue... } else if (Pal::Result::Success == res) { trace_.sqtt_disp_count_ = 0; + // Stop the trace and save the result. Currently runtime can't delay upload in HIP, + // because default stream doesn't have explicit destruction and + // OS kills all threads on exit without any notification. That includes PAL RGP threads. 
+ { + if (trace_.status_ == TraceStatus::WaitingForSqtt) { + auto result = EndRGPTrace(gpu); + } + // Check if runtime is waiting for the final trace results + if (trace_.status_ == TraceStatus::WaitingForResults) { + // If results are ready, then finish the trace + if (CheckForTraceResults() == Pal::Result::Success) { + FinishRGPTrace(gpu, false); + } + } + } } else { FinishRGPTrace(gpu, true); } @@ -517,11 +532,17 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) { } } - // Notify the RGP server that we are starting a trace - if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { - result = Pal::Result::ErrorUnknown; + if (static_vm_id_) { + result = device_.iDev()->SetStaticVmidMode(true); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); } + if (result == Pal::Result::Success) { + // Notify the RGP server that we are starting a trace + if (rgp_server_->BeginTrace() != DevDriver::Result::Success) { + result = Pal::Result::ErrorUnknown; + } + } // Tell the GPA session class we're starting a trace if (result == Pal::Result::Success) { GpuUtil::GpaSessionBeginInfo info = {}; @@ -707,6 +728,7 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) { return; } + auto disp_count = trace_.sqtt_disp_count_; // Finish the trace if the queue was destroyed before OCL reached // the number of captured dispatches if (trace_.sqtt_disp_count_ != 0) { @@ -736,9 +758,18 @@ } else { rgp_server_->EndTrace(); } + + if (static_vm_id_) { + auto result = device_.iDev()->SetStaticVmidMode(false); + assert(result == Pal::Result::Success && "Static VM ID setup failed!"); + } + if (trace_.gpa_session_ != nullptr) { trace_.gpa_session_->Reset(); } + // If application exits, then Windows kills all threads and + // RGP can't finish writing data to a file. 
+ amd::Os::sleep(10 * disp_count + 500); // Reset tracing state to idle trace_.prepared_disp_count_ = 0; trace_.sqtt_disp_count_ = 0; diff --git a/device/pal/palgpuopen.hpp b/device/pal/palgpuopen.hpp index ddae799b..33e9d35d 100644 --- a/device/pal/palgpuopen.hpp +++ b/device/pal/palgpuopen.hpp @@ -410,9 +410,10 @@ class RgpCaptureMgr { union { struct { - uint32_t trace_enabled_ : 1; // True if tracing is currently enabled (master flag) - uint32_t inst_tracing_enabled_; // Enable instruction-level SQTT tokens - uint32_t perf_counters_enabled_; // True if perf counters are enabled + uint32_t trace_enabled_: 1; // True if tracing is currently enabled (master flag) + uint32_t inst_tracing_enabled_: 1; // Enable instruction-level SQTT tokens + uint32_t perf_counters_enabled_: 1; // True if perf counters are enabled + uint32_t static_vm_id_: 1; // Static VM ID can be used for capture }; uint32_t value_; }; diff --git a/device/pal/palkernel.cpp b/device/pal/palkernel.cpp index 19c6d9fe..035a6288 100644 --- a/device/pal/palkernel.cpp +++ b/device/pal/palkernel.cpp @@ -362,7 +362,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (palDevice().HeapBuffer() == nullptr)) { - const_cast(palDevice()).HiddenHeapAlloc(); + const_cast(palDevice()).HiddenHeapAlloc(gpu); } if (palDevice().HeapBuffer() != nullptr) { // Add heap pointer to the code diff --git a/device/pal/palsettings.cpp b/device/pal/palsettings.cpp index 1f4dc8e9..080654c1 100644 --- a/device/pal/palsettings.cpp +++ b/device/pal/palsettings.cpp @@ -78,9 +78,10 @@ Settings::Settings() { // By default use host blit blitEngine_ = BlitEngineHost; - pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi; + pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi; + size_t defaultMinXferSize = amd::IS_HIP ? 128: 4; pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE) - ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; + ? defaultMinXferSize * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi; // Disable FP_FAST_FMA defines by default reportFMAF_ = false; @@ -205,6 +206,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Navi31: // Fall through for Navi2x ... case Pal::AsicRevision::Phoenix1: + case Pal::AsicRevision::Phoenix2: case Pal::AsicRevision::Raphael: case Pal::AsicRevision::Rembrandt: case Pal::AsicRevision::Navi24: diff --git a/device/pal/palvirtual.cpp b/device/pal/palvirtual.cpp index 64b5bfa6..624379d1 100644 --- a/device/pal/palvirtual.cpp +++ b/device/pal/palvirtual.cpp @@ -1072,15 +1072,15 @@ bool VirtualGPU::allocHsaQueueMem() { } VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); + // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); } - // Not safe to remove a queue. 
So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); - while (!freeCbQueue_.empty()) { auto cb = freeCbQueue_.front(); delete cb; @@ -3741,49 +3741,7 @@ bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) { return false; } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getGpuMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - flushDMA(staging->getGpuEvent(*this)->engineId_); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void* VirtualGPU::getOrCreateHostcallBuffer() { if (hostcallBuffer_ != nullptr) { return hostcallBuffer_; diff --git a/device/pal/palvirtual.hpp b/device/pal/palvirtual.hpp index 1bbfe8e9..74fb039f 100644 --- a/device/pal/palvirtual.hpp +++ b/device/pal/palvirtual.hpp @@ -345,7 +345,6 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); virtual void submitVirtualMap(amd::VirtualMapCommand& cmd); virtual void submitStreamOperation(amd::StreamOperationCommand& cmd); void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd); @@ -360,6 +359,8 @@ class VirtualGPU : public device::VirtualDevice { bool isFenceDirty() const { return false; } + void resetFenceDirty() {} + //! Returns GPU device object associated with this kernel const Device& dev() const { return gpuDevice_; } diff --git a/device/rocm/pro/lnxheaders.h b/device/rocm/pro/lnxheaders.h deleted file mode 100644 index 8c9fbe14..00000000 --- a/device/rocm/pro/lnxheaders.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -// NOTE: Some of the Linux driver stack's headers don't wrap their C-style interface names in 'extern "C" { ... }' -// blocks when building with a C++ compiler, so we need to add that ourselves. -#if __cplusplus -extern "C" -{ -#endif - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -constexpr int32_t InvalidFd = -1; // value representing a invalid file descriptor for Linux - -#if __cplusplus -} // extern "C" -#endif diff --git a/device/rocm/pro/prodevice.cpp b/device/rocm/pro/prodevice.cpp deleted file mode 100644 index 1e366cd2..00000000 --- a/device/rocm/pro/prodevice.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#ifndef WITHOUT_HSA_BACKEND - -#include "hsa/hsa_ext_amd.h" -#include "lnxheaders.h" -#include "prodevice.hpp" -#include "amdgpu_drm.h" - -namespace roc { - -constexpr uint32_t kMaxDevices = 32; -constexpr uint32_t kAtiVendorId = 0x1002; - -void* ProDevice::lib_drm_handle_ = nullptr; -bool ProDevice::initialized_ = false; -drm::Funcs ProDevice::funcs_; - -IProDevice* IProDevice::Init(uint32_t bus, uint32_t device, uint32_t func) -{ - // Make sure DRM lib is initialized - if (!ProDevice::DrmInit()) { - return nullptr; - } - - ProDevice* pro_device = new ProDevice(); - - if (pro_device == nullptr || !pro_device->Create(bus, dev, func)) { - delete pro_device; - return nullptr; - } - return pro_device; -} - -ProDevice::~ProDevice() { - delete alloc_ops_; - - if (dev_handle_ != nullptr) { - Funcs().AmdgpuDeviceDeinitialize(dev_handle_); - } - if (file_desc_ > 0) { - close(file_desc_); - } -} - -bool ProDevice::DrmInit() -{ - if (initialized_ == false) { - // Find symbols in libdrm_amdgpu.so.1 - lib_drm_handle_ = dlopen("libdrm_amdgpu.so.1", RTLD_NOW); - if (lib_drm_handle_ == nullptr) { - return false; - } else { - funcs_.DrmGetDevices = reinterpret_cast(dlsym( - lib_drm_handle_, - "drmGetDevices")); - if (funcs_.DrmGetDevices == nullptr) return false; - funcs_.AmdgpuDeviceInitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_initialize")); - if (funcs_.AmdgpuDeviceInitialize == nullptr) return false; - funcs_.AmdgpuDeviceDeinitialize = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_device_deinitialize")); - if (funcs_.AmdgpuDeviceDeinitialize == nullptr) return false; - funcs_.AmdgpuQueryGpuInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_gpu_info")); - if (funcs_.AmdgpuQueryGpuInfo == nullptr) return false; - funcs_.AmdgpuQueryInfo = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_query_info")); - if (funcs_.AmdgpuQueryInfo == nullptr) return false; - funcs_.AmdgpuBoAlloc = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_alloc")); - if (funcs_.AmdgpuBoAlloc == nullptr) return false; - funcs_.AmdgpuBoExport = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_export")); - if (funcs_.AmdgpuBoExport == nullptr) return false; - funcs_.AmdgpuBoFree = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_free")); - if (funcs_.AmdgpuBoFree == nullptr) return false; - funcs_.AmdgpuBoCpuMap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_map")); - if (funcs_.AmdgpuBoCpuMap == nullptr) return false; - funcs_.AmdgpuBoCpuUnmap = reinterpret_cast(dlsym( - lib_drm_handle_, - "amdgpu_bo_cpu_unmap")); - if (funcs_.AmdgpuBoCpuUnmap == nullptr) return false; - } - } - - initialized_ = true; - return true; -} - -#ifndef AMDGPU_CAPABILITY_SSG_FLAG -#define AMDGPU_CAPABILITY_SSG_FLAG 4 -#endif - -// ================================================================================================ -// Open drm device and initialize it. And also get the drm information. 
-bool ProDevice::Create(uint32_t bus, uint32_t device, uint32_t func) { - drmDevicePtr devices[kMaxDevices] = { }; - int32_t device_count = Funcs().DrmGetDevices(devices, kMaxDevices); - bool result = false; - - for (int32_t i = 0; i < device_count; i++) { - // Check if the device vendor is AMD - if (devices[i]->deviceinfo.pci->vendor_id != kAtiVendorId) { - continue; - } - if ((devices[i]->businfo.pci->bus == bus) && - (devices[i]->businfo.pci->dev == device) && - (devices[i]->businfo.pci->func == func)) { - - // pDevices[i]->nodes[DRM_NODE_PRIMARY]; - // Using render node here so that we can do the off-screen rendering without authentication - file_desc_ = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR, 0); - - if (file_desc_ > 0) { - void* data, *file, *cap; - - // Initialize the admgpu device. - if (Funcs().AmdgpuDeviceInitialize(file_desc_, &major_ver_, - &minor_ver_, &dev_handle_) == 0) { - uint32_t version = 0; - // amdgpu_query_gpu_info will never fail only if it is initialized - Funcs().AmdgpuQueryGpuInfo(dev_handle_, &gpu_info_); - - drm_amdgpu_capability cap = {}; - Funcs().AmdgpuQueryInfo(dev_handle_, AMDGPU_INFO_CAPABILITY, sizeof(drm_amdgpu_capability), &cap); - - // Check if DGMA and SSG are available - if ((cap.flag & (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) == - (AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG)) { - result = true; - break; - } - } - } - } - } - - if (result) { - alloc_ops_ = new amd::Monitor("DGMA mem alloc lock", true); - if (nullptr == alloc_ops_) { - return true; - } - } - - return result; -} - -void* ProDevice::AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const -{ - amd::ScopedLock l(alloc_ops_); - void* ptr = nullptr; - amdgpu_bo_handle buf_handle = 0; - amdgpu_bo_alloc_request req = {0}; - *host_ptr = nullptr; - - req.alloc_size = size; - req.phys_alignment = 64 * Ki; - req.preferred_heap = AMDGPU_GEM_DOMAIN_DGMA; - - // Allocate buffer in DGMA heap - if (0 == Funcs().AmdgpuBoAlloc(dev_handle_, &req, &buf_handle)) { - amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd; - uint32_t shared_handle = 0; - // Find the base driver handle - if (0 == Funcs().AmdgpuBoExport(buf_handle, type, &shared_handle)) { - uint32_t flags = 0; - size_t buf_size = 0; - // Map memory object to HSA device - if (0 == hsa_amd_interop_map_buffer(1, &agent, shared_handle, - flags, &buf_size, &ptr, nullptr, nullptr)) { - // Ask GPUPro driver to provide CPU access to allocation - if (0 == Funcs().AmdgpuBoCpuMap(buf_handle, host_ptr)) { - allocs_.insert({ptr, {buf_handle, shared_handle}}); - } - else { - hsa_amd_interop_unmap_buffer(ptr); - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } - else { - close(shared_handle); - Funcs().AmdgpuBoFree(buf_handle); - } - } - else { - Funcs().AmdgpuBoFree(buf_handle); - } - } - - return ptr; -} - -void ProDevice::FreeDmaBuffer(void* ptr) const -{ - amd::ScopedLock l(alloc_ops_); - auto it = allocs_.find(ptr); - if (it != allocs_.end()) { - Funcs().AmdgpuBoCpuUnmap(it->second.first); - // Unmap memory from HSA device - hsa_amd_interop_unmap_buffer(ptr); - // Close shared handle - close(it->second.second); - int error = Funcs().AmdgpuBoFree(it->second.first); - allocs_.erase(it); - } -} - -} - -#endif // WITHOUT_HSA_BACKEND - diff --git a/device/rocm/pro/prodevice.hpp b/device/rocm/pro/prodevice.hpp deleted file mode 100644 index 80ff3600..00000000 --- a/device/rocm/pro/prodevice.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced 
Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "profuncs.hpp" -#include "prodriver.hpp" -#include "thread/monitor.hpp" -#include - -/*! \addtogroup HSA - * @{ - */ - -//! HSA Device Implementation -namespace roc { - -class ProDevice : public IProDevice { -public: - static bool DrmInit(); - - ProDevice() - : file_desc_(0) - , major_ver_(0) - , minor_ver_(0) - , dev_handle_(nullptr) - , alloc_ops_(nullptr) {} - virtual ~ProDevice() override; - - bool Create(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer( - hsa_agent_t agent, size_t size, void** host_ptr) const override; - virtual void FreeDmaBuffer(void* ptr) const override; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const override - { - *asic_id = gpu_info_.asic_id; - *rev_id = gpu_info_.pci_rev_id; - } - -private: - static void* lib_drm_handle_; - static bool initialized_; - static drm::Funcs funcs_; - const drm::Funcs& Funcs() const { return funcs_; } - - int32_t file_desc_; //!< File descriptor for the device - uint32_t major_ver_; //!< Major driver version - uint32_t minor_ver_; //!< Minor driver version - amdgpu_device_handle dev_handle_; //!< AMD gpu device handle - amdgpu_gpu_info gpu_info_; //!< GPU info structure - amdgpu_heap_info heap_info_; //!< Information about memory - mutable std::unordered_map> allocs_; //!< Alloced memory mapping - amd::Monitor* alloc_ops_; //!< Serializes memory allocations/destructions -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/prodriver.hpp b/device/rocm/pro/prodriver.hpp deleted file mode 100644 index 819ade27..00000000 --- a/device/rocm/pro/prodriver.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "top.hpp" -#include "hsa/hsa.h" - -/*! \addtogroup HSA - * @{ - */ - -namespace roc { - -//! Pro Device Interface -class IProDevice : public amd::HeapObject { -public: - static IProDevice* Init(uint32_t bus, uint32_t device, uint32_t func); - - virtual void* AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const = 0; - virtual void FreeDmaBuffer(void* ptr) const = 0; - virtual void GetAsicIdAndRevisionId(uint32_t* asic_id, uint32_t* rev_id) const = 0; - - IProDevice() {} - virtual ~IProDevice() {} -}; - -} // namespace roc - -/** - * @} - */ -#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/device/rocm/pro/profuncs.hpp b/device/rocm/pro/profuncs.hpp deleted file mode 100644 index e878df0c..00000000 --- a/device/rocm/pro/profuncs.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#pragma once - -namespace roc -{ -namespace drm -{ -typedef int (*DrmGetDevices)( - drmDevicePtr* pDevices, - int maxDevices); - -typedef int (*AmdgpuDeviceInitialize)( - int fd, - uint32_t* pMajorVersion, - uint32_t* pMinorVersion, - amdgpu_device_handle* pDeviceHandle); - -typedef int (*AmdgpuDeviceDeinitialize)( - amdgpu_device_handle hDevice); - -typedef int (*AmdgpuQueryGpuInfo)( - amdgpu_device_handle hDevice, - struct amdgpu_gpu_info* pInfo); - -typedef int (*AmdgpuQueryInfo)( - amdgpu_device_handle hDevice, - unsigned infoId, - unsigned size, - void* pValue); - -typedef int (*AmdgpuBoAlloc)( - amdgpu_device_handle hDevice, - struct amdgpu_bo_alloc_request* pAllocBuffer, - amdgpu_bo_handle* pBufferHandle); - -typedef int (*AmdgpuBoExport)( - amdgpu_bo_handle hBuffer, - enum amdgpu_bo_handle_type type, - uint32_t* pFd); - -typedef int (*AmdgpuBoFree)( - amdgpu_bo_handle hBuffer); - -typedef int (*AmdgpuBoCpuMap)( - amdgpu_bo_handle hBuffer, - void** ppCpuAddress); - -typedef int (*AmdgpuBoCpuUnmap)( - amdgpu_bo_handle hBuffer); - -struct Funcs -{ - DrmGetDevices DrmGetDevices; - AmdgpuDeviceInitialize AmdgpuDeviceInitialize; - AmdgpuDeviceDeinitialize AmdgpuDeviceDeinitialize; - AmdgpuQueryGpuInfo AmdgpuQueryGpuInfo; - AmdgpuQueryInfo AmdgpuQueryInfo; - AmdgpuBoAlloc AmdgpuBoAlloc; - AmdgpuBoExport AmdgpuBoExport; - AmdgpuBoFree AmdgpuBoFree; - AmdgpuBoCpuMap AmdgpuBoCpuMap; - AmdgpuBoCpuUnmap AmdgpuBoCpuUnmap; -}; - -} //namespace drm -} //namespace roc diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp index f6b447ca..1bd6a0ff 100644 --- a/device/rocm/rocblit.cpp +++ b/device/rocm/rocblit.cpp @@ -457,7 +457,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy Rect wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy Rect wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent, direction, wait_events.size(), wait_events.data(), active); @@ -478,7 +478,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d // Copy memory line by line ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy wait_event=0x%zx, completion_signal=0x%zx", + "HSA Async Copy wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy( (reinterpret_cast
(dst) + dstOffset), dstAgent, @@ -668,9 +668,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, // Use SDMA to transfer the data ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Asycn Copy dst=0x%zx, src=0x%zx, size=%d, wait_event=0x%zx, " - "completion_signal=0x%zx", - dst, src, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); + "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, completion_signal=0x%zx", + dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], wait_events.size(), wait_events.data(), active); @@ -2182,7 +2181,7 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern // ================================================================================================ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, bool entire, + const amd::Coord3D& sizeIn, bool entire, amd::CopyMetadata copyMetadata) const { amd::ScopedLock k(lockXferOps_); bool result = false; @@ -2682,35 +2681,20 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, Memory* schedulerMem = dev().getRocMemory(schedulerParam); sp->kernarg_address = reinterpret_cast(schedulerMem->getDeviceMemory()); - - sp->hidden_global_offset_x = 0; - sp->hidden_global_offset_y = 0; - sp->hidden_global_offset_z = 0; sp->thread_counter = 0; sp->child_queue = reinterpret_cast(schedulerQueue); sp->complete_signal = schedulerSignal; hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne); - sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - sp->scheduler_aql.setup = 1; - sp->scheduler_aql.workgroup_size_x = 1; - sp->scheduler_aql.workgroup_size_y = 1; - sp->scheduler_aql.workgroup_size_z = 1; - sp->scheduler_aql.grid_size_x = threads; - sp->scheduler_aql.grid_size_y = 1; - sp->scheduler_aql.grid_size_z = 1; - sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle(); - sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address; - sp->scheduler_aql.private_segment_size = 0; - sp->scheduler_aql.group_segment_size = 0; + sp->vqueue_header = vqVM; sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam); - sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; + + if (dev().info().maxEngineClockFrequency_ > 0) { + sp->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; + } // Use a device side global atomics to workaround the reliance of PCIe 3 atomics sp->write_index = hsa_queue_load_write_index_relaxed(schedulerQueue); @@ -2721,7 +2705,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam, address parameters = captureArguments(kernels_[Scheduler]); if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], - parameters, nullptr)) { + parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) { return false; } releaseArguments(parameters); diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp index 961227bb..287ece39 100644 --- a/device/rocm/rocdevice.cpp +++ b/device/rocm/rocdevice.cpp @@ -40,9 +40,6 @@ #include "device/rocm/rocmemory.hpp" #include 
"device/rocm/rocglinterop.hpp" #include "device/rocm/rocsignal.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif #include "platform/sampler.hpp" #if defined(__clang__) @@ -163,8 +160,6 @@ Device::Device(hsa_agent_t bkendDevice) , xferQueue_(nullptr) , xferRead_(nullptr) , xferWrite_(nullptr) - , pro_device_(nullptr) - , pro_ena_(false) , freeMem_(0) , vgpusAccess_("Virtual GPU List Ops Lock", true) , hsa_exclusive_gpu_access_(false) @@ -218,9 +213,6 @@ void Device::checkAtomicSupport() { } Device::~Device() { -#ifdef WITH_AMDGPU_PRO - delete pro_device_; -#endif // Release cached map targets for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { if ((*mapCache_)[i] != nullptr) { @@ -244,17 +236,17 @@ Device::~Device() { } for (auto& it : queuePool_) { - for (auto& qIter : it) { - hsa_queue_t* queue = qIter.first; - auto& qInfo = qIter.second; + for (auto qIter = it.begin(); qIter != it.end(); ) { + hsa_queue_t* queue = qIter->first; + auto& qInfo = qIter->second; if (qInfo.hostcallBuffer_) { ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p", - qInfo.hostcallBuffer_, qIter.first); + qInfo.hostcallBuffer_, qIter->first); disableHostcalls(qInfo.hostcallBuffer_); context().svmFree(qInfo.hostcallBuffer_); } ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); - it.erase(queue); + qIter = it.erase(qIter); hsa_queue_destroy(queue); } } @@ -687,18 +679,6 @@ bool Device::create() { } info_.pciDomainID = pci_domain_id; -#ifdef WITH_AMDGPU_PRO - // Create amdgpu-pro device interface for SSG support - pro_device_ = IProDevice::Init( - info_.deviceTopology_.pcie.bus, - info_.deviceTopology_.pcie.device, - info_.deviceTopology_.pcie.function); - if (pro_device_ != nullptr) { - pro_ena_ = true; - pro_device_->GetAsicIdAndRevisionId(&info_.pcieDeviceId_, &info_.pcieRevisionId_); - } -#endif - // Get Agent HDP Flush Register Memory hsa_amd_hdp_flush_t hdpInfo; if (HSA_STATUS_SUCCESS != @@ -1169,7 +1149,9 @@ bool Device::populateOCLDeviceConstants() { //TODO: add the assert statement for Raven if (!(isa().versionMajor() == 9 && isa().versionMinor() == 0 && isa().versionStepping() == 2)) { - assert(info_.maxEngineClockFrequency_ > 0); + if (info_.maxEngineClockFrequency_ <= 0) { + LogError("maxEngineClockFrequency_ is NOT positive!"); + } } if (HSA_STATUS_SUCCESS != @@ -1642,7 +1624,7 @@ bool Device::populateOCLDeviceConstants() { LogError("HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS query failed."); } - LogPrintfInfo("HMM support: %d, xnack: %d, direct host access: %d\n", + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "HMM support: %d, xnack: %d, direct host access: %d\n", info_.hmmSupported_, info_.hmmCpuMemoryAccessible_, info_.hmmDirectHostAccess_); info_.globalCUMask_ = {}; @@ -2229,8 +2211,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* me return false; } - // Pass the pointer and memory size to retrieve the handle - hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, amd::alignUp(*mem_size, alloc_granularity()), + hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, *mem_size, reinterpret_cast(handle)); if (hsa_status != HSA_STATUS_SUCCESS) { @@ -2331,7 +2312,6 @@ bool Device::IpcDetach (void* dev_ptr) const { // ================================================================================================ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { - constexpr bool kForceAllocation 
= true; amd::Memory* mem = nullptr; if (nullptr == svmPtr) { @@ -2343,7 +2323,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return nullptr; } - if (!mem->create(nullptr, false, false, kForceAllocation)) { + if (!mem->create(nullptr)) { LogError("failed to create a svm hidden buffer!"); mem->release(); return nullptr; @@ -2730,10 +2710,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI return result; } +// ================================================================================================ +bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const { + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); + if (hw_event == nullptr) { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); + return false; + } + static constexpr bool Timeout = true; + return WaitForSignal(reinterpret_cast(hw_event)->signal_, false, true); +} + // ================================================================================================ bool Device::IsHwEventReady(const amd::Event& event, bool wait) const { - void* hw_event = (event.NotifyEvent() != nullptr) ? - event.NotifyEvent()->HwEvent() : event.HwEvent(); + void* hw_event = + (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); if (hw_event == nullptr) { ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); return false; @@ -3210,7 +3202,9 @@ device::Signal* Device::createSignal() const { amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) { // Only create arena_mem_object if CPU memory is accessible from HMM // or if runtime received an interop from another ROCr's client - if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) { + hsa_amd_pointer_info_t ptr_info = {}; + ptr_info.size = sizeof(hsa_amd_pointer_info_t); + if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) { return nullptr; } @@ -3227,8 +3221,9 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size } // Calculate the offset of the pointer. 
- const void* dev_ptr = reinterpret_cast(arena_mem_obj_->getDeviceMemory( - *arena_mem_obj_->getContext().devices()[0])->virtualAddress()); + const void* dev_ptr = reinterpret_cast( + arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0]) + ->virtualAddress()); offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr); return arena_mem_obj_; @@ -3242,20 +3237,25 @@ void Device::ReleaseGlobalSignal(void* signal) const { } // ================================================================================================ -bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { - hsa_amd_pointer_info_t ptr_info = {}; - ptr_info.size = sizeof(hsa_amd_pointer_info_t); +bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) { // Query ptr type to see if it's a HMM allocation - hsa_status_t status = hsa_amd_pointer_info( - const_cast(dev_ptr), &ptr_info, nullptr, nullptr, nullptr); + hsa_status_t status = + hsa_amd_pointer_info(const_cast(dev_ptr), ptr_info, nullptr, nullptr, nullptr); // The call should never fail in ROCR, but just check for an error and continue if (status != HSA_STATUS_SUCCESS) { LogError("hsa_amd_pointer_info() failed"); } - // Check if it's a legacy non-HMM allocation in ROCr - if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) { - if ((size != 0) && ((reinterpret_cast(dev_ptr) - - reinterpret_cast(ptr_info.agentBaseAddress)) > size)) { + + // Return false for pinned memory. A true return may result in a race because + // ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment + if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) { + return false; + } + + if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) { + if ((size != 0) && + ((reinterpret_cast(dev_ptr) - + reinterpret_cast(ptr_info->agentBaseAddress)) > size)) { return false; } return true; @@ -3264,8 +3264,8 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const { } // ================================================================================================ -void Device::HiddenHeapAlloc() { - auto HeapAllocZeroOut = [this]() -> bool { +void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { + auto HeapAllocZeroOut = [this, &gpu]() -> bool { // Allocate initial heap for device memory allocator static constexpr size_t HeapBufferSize = 128 * Ki; heap_buffer_ = createMemory(HeapBufferSize); @@ -3277,7 +3277,7 @@ void Device::HiddenHeapAlloc() { LogError("Heap buffer allocation failed!"); return false; } - bool result = static_cast(xferMgr()).initHeap( + bool result = static_cast(gpu.blitMgr()).initHeap( heap_buffer_, initial_heap_buffer_, HeapBufferSize, initial_heap_size_ / (2 * Mi)); return result; diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp index b3da3783..b36d21ef 100644 --- a/device/rocm/rocdevice.hpp +++ b/device/rocm/rocdevice.hpp @@ -75,7 +75,6 @@ class Memory; class Resource; class VirtualDevice; class PrintfDbg; -class IProDevice; class ProfilingSignal : public amd::ReferenceCountedObject { public: @@ -258,6 +257,7 @@ class NullDevice : public amd::Device { cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; } virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; } + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; } virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {}; virtual void ReleaseGlobalSignal(void* signal) const 
{} @@ -443,6 +443,7 @@ class Device : public NullDevice { cl_set_device_clock_mode_output_amd* pSetClockModeOutput); virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const; + virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const; virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const; virtual void ReleaseGlobalSignal(void* signal) const; @@ -478,10 +479,6 @@ class Device : public NullDevice { //! Create internal blit program bool createBlitProgram(); - // Returns AMD GPU Pro interfacs - const IProDevice& iPro() const { return *pro_device_; } - bool ProEna() const { return pro_ena_; } - // P2P agents avaialble for this device const std::vector& p2pAgents() const { return p2p_agents_; } @@ -549,10 +546,10 @@ class Device : public NullDevice { const bool isFineGrainSupported() const; //! Returns True if memory pointer is known to ROCr (excludes HMM allocations) - bool IsValidAllocation(const void* dev_ptr, size_t size) const; + bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info); //! Allocates hidden heap for device memory allocations - void HiddenHeapAlloc(); + void HiddenHeapAlloc(const VirtualGPU& gpu); private: bool create(); @@ -598,8 +595,6 @@ class Device : public NullDevice { XferBuffers* xferRead_; //!< Transfer buffers read XferBuffers* xferWrite_; //!< Transfer buffers write - const IProDevice* pro_device_; //!< AMDGPUPro device - bool pro_ena_; //!< Extra functionality with AMDGPUPro device, beyond ROCr std::atomic freeMem_; //!< Total of free memory available mutable amd::Monitor vgpusAccess_; //!< Lock to serialise virtual gpu list access bool hsa_exclusive_gpu_access_; //!< TRUE if current device was moved into exclusive GPU access mode diff --git a/device/rocm/rocmemory.cpp b/device/rocm/rocmemory.cpp index fbe6bd01..d267e0d0 100644 --- a/device/rocm/rocmemory.cpp +++ b/device/rocm/rocmemory.cpp @@ -37,9 +37,6 @@ #include "platform/sampler.hpp" #include "amdocl/cl_gl_amd.hpp" #include "amdocl/cl_vk_amd.hpp" -#ifdef WITH_AMDGPU_PRO -#include "pro/prodriver.hpp" -#endif namespace roc { @@ -646,8 +643,9 @@ void Buffer::destroy() { if (memFlags & CL_MEM_ALLOC_HOST_PTR) { if (dev().info().hmmSupported_) { // AMD HMM path. 
Destroy system memory - amd::Os::uncommitMemory(deviceMemory_, size()); - amd::Os::releaseMemory(deviceMemory_, size()); + if (!(amd::Os::releaseMemory(deviceMemory_, size()))) { + ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "[ROCClr] munmap failed \n"); + } } else { dev().hostFree(deviceMemory_, size()); } @@ -673,12 +671,6 @@ void Buffer::destroy() { return; } -#ifdef WITH_AMDGPU_PRO - if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) { - dev().iPro().FreeDmaBuffer(deviceMemory_); - return; - } -#endif if (deviceMemory_ != nullptr) { if (deviceMemory_ != owner()->getHostMem()) { // if they are identical, the host pointer will be @@ -755,7 +747,6 @@ bool Buffer::create(bool alloc_local) { if (deviceMemory_ == NULL) { return false; } - amd::Os::commitMemory(deviceMemory_, size(), amd::Os::MEM_PROT_RW); // Currently HMM requires cirtain initial calls to mark sysmem allocation as // GPU accessible or prefetch memory into GPU if (!dev().SvmAllocInit(deviceMemory_, size())) { @@ -797,7 +788,7 @@ bool Buffer::create(bool alloc_local) { } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0, - (memFlags & ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN) != 0); + (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0); } owner()->setSvmPtr(deviceMemory_); } else { diff --git a/device/rocm/rocurilocator.hpp b/device/rocm/rocurilocator.hpp index 880b6c72..76451087 100644 --- a/device/rocm/rocurilocator.hpp +++ b/device/rocm/rocurilocator.hpp @@ -22,7 +22,7 @@ #if defined(__clang__) #if __has_feature(address_sanitizer) #include "device/devurilocator.hpp" -#include "hsa_ven_amd_loader.h" +#include "hsa/hsa_ven_amd_loader.h" #include namespace roc { diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 56b6022f..f2850b55 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -78,13 +78,8 @@ static constexpr uint16_t kBarrierPacketHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierPacketAgentScopeHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - static constexpr uint16_t kNopPacketHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); @@ -99,14 +94,16 @@ static constexpr uint16_t kBarrierPacketReleaseHeader = (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr uint16_t kBarrierVendorPacketHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static constexpr uint16_t kBarrierVendorPacketAgentScopeHeader = - (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - 
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); +static constexpr uint16_t kBarrierVendorPacketNopScopeHeader = + (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static constexpr hsa_barrier_and_packet_t kBarrierAcquirePacket = { kBarrierPacketAcquireHeader, 0, 0, {{0}}, 0, {0}}; @@ -433,8 +430,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal( prof_signal->ts_ = ts; ts->AddProfilingSignal(prof_signal); if (AMD_DIRECT_DISPATCH) { - bool enqueHandler= false; - uint32_t init_value = kInitSignalValueOne; + bool enqueHandler = false; enqueHandler = (ts->command().Callback() != nullptr || ts->command().GetBatchHead() != nullptr ) && !ts->command().CpuWaitRequested(); @@ -990,6 +986,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + fence_dirty_ = true; auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); if (!skipSignal) { @@ -1002,7 +999,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal, } // Reset fence_dirty_ flag if we submit a barrier - fence_dirty_ = false; + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_barrier_and_packet_t* aql_loc = @@ -1064,6 +1063,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD } } + fence_dirty_ = true; + auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); + if (completionSignal.handle == 0) { // Get active signal for current dispatch if profiling is necessary barrier_value_packet_.completion_signal = @@ -1073,6 +1076,11 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD barrier_value_packet_.completion_signal = completionSignal; } + // Reset fence_dirty_ flag if we submit a barrier + if (cache_state == amd::Device::kCacheStateSystem) { + fence_dirty_ = false; + } + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask); hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast( @@ -1080,9 +1088,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD *aql_loc = barrier_value_packet_; packet_store_release(reinterpret_cast(aql_loc), packetHeader, rest); - auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE); - hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, @@ -1817,7 +1822,7 @@ bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memo realSize.c[0] *= elemSize; } - result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, + result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, realSize, entire, copyMetadata); break; } @@ -2781,7 +2786,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) // 
================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, void* eventHandle, - uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) { + uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd, + hsa_kernel_dispatch_packet_t* aql_packet) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); @@ -2957,7 +2963,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, case amd::KernelParameterDescriptor::HiddenHeap: // Allocate hidden heap for HIP applications only if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) { - const_cast(dev()).HiddenHeapAlloc(); + const_cast(dev()).HiddenHeapAlloc(*this); } if (dev().HeapBuffer() != nullptr) { // Add heap pointer to the code @@ -3108,6 +3114,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_; } + // Copy scheduler's AQL packet for possible relaunch from the scheduler itself + if (aql_packet != nullptr) { + *aql_packet = dispatchPacket; + aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + } + // Dispatch the packet if (!dispatchAqlPacket( &dispatchPacket, aqlHeaderWithOrder, @@ -3242,11 +3258,11 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { if (timestamp_ != nullptr) { const Settings& settings = dev().settings(); int32_t releaseFlags = vcmd.getEventScope(); - if (releaseFlags == Device::CacheState::kCacheStateAgent) { + if (releaseFlags == Device::CacheState::kCacheStateIgnore) { if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) { - dispatchBarrierValuePacket(kBarrierVendorPacketAgentScopeHeader, true); + dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true); } else { - dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false); + dispatchBarrierPacket(kNopPacketHeader, false); } } else { // Submit a barrier with a cache flushes. 
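
The barrier hunks above make the fence tracking scope-aware: dispatchBarrierPacket() and dispatchBarrierValuePacket() now mark the queue's fence as dirty for every barrier they build, and clear the flag again only when the packet releases at system cache scope. A minimal sketch of that pattern, with TrackedQueue and CacheScope as illustrative stand-ins rather than the real VirtualGPU and Device types:

  // Sketch only: mirrors the fence_dirty_ bookkeeping added to the barrier dispatch paths.
  enum class CacheScope { None, Agent, System };

  class TrackedQueue {
   public:
    void dispatchBarrier(CacheScope release_scope) {
      fence_dirty_ = true;                      // any barrier may leave caches unreleased
      if (release_scope == CacheScope::System) {
        fence_dirty_ = false;                   // a system-scope release makes writes visible
      }
      // ... build the AQL barrier packet and ring the doorbell here ...
    }

    bool isFenceDirty() const { return fence_dirty_; }
    void resetFenceDirty() { fence_dirty_ = false; }

   private:
    bool fence_dirty_ = false;                  // queried before deciding to flush again
  };

The isFenceDirty()/resetFenceDirty() pair corresponds to the accessor added to rocvirtual.hpp further below, which lets the platform layer decide whether a final system-scope barrier is still required.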
@@ -3353,55 +3369,10 @@ amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { return nullptr; } +// ================================================================================================ void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); } -void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getRocMemory(&cmd.memory()); - uint idx = 0; - - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - bool result = blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } - } else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getRocMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - bool result = blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - - void* srcBuffer = staging->cpuMap(*this); - - staging->cpuUnmap(*this); - - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } - } -} - +// ================================================================================================ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index 11a3670e..8c002658 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -46,10 +46,10 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits::max(); // Active wait time out incase same sdma engine is used again, // then just wait instead of adding dependency wait signal. -constexpr static uint64_t kSDMAEngineTimeout = 10; +constexpr static uint64_t kForcedTimeout = 10; template -inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) { +inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) { if (hsa_signal_load_relaxed(signal) > 0) { uint64_t timeout = kTimeout100us; if (active_wait) { @@ -57,7 +57,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sd } if (active_wait_timeout) { // If diff engine, wait to 10 ms. Otherwise no wait - timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; + timeout = (forced_wait ? 
kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K; if (timeout == 0) { return false; } @@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice { const_address parameters, //!< Parameters for the kernel void* event_handle, //!< Handle to OCL event for debugging uint32_t sharedMemBytes = 0, //!< Shared memory size - amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command + amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command + hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch ); void submitNativeFn(amd::NativeFnCommand& cmd); void submitMarker(amd::Marker& cmd); @@ -341,8 +342,6 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSignal(amd::SignalCommand& cmd) {} virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {} - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {} void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {} @@ -408,6 +407,7 @@ class VirtualGPU : public device::VirtualDevice { void* allocKernArg(size_t size, size_t alignment); bool isFenceDirty() const { return fence_dirty_; } + void resetFenceDirty() { fence_dirty_ = false; } // } roc OpenCL integration private: //! Dispatches a barrier with blocking HSA signals diff --git a/platform/command.cpp b/platform/command.cpp index cc35579a..35ec5113 100644 --- a/platform/command.cpp +++ b/platform/command.cpp @@ -436,13 +436,12 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList profilingInfo_.clear(); profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = true; - setEventScope(amd::Device::kCacheStateSystem); } kernel_.retain(); } void NDRangeKernelCommand::releaseResources() { - kernel_.parameters().release(parameters_, queue()->device()); + kernel_.parameters().release(parameters_); DEBUG_ONLY(parameters_ = NULL); kernel_.release(); Command::releaseResources(); @@ -726,71 +725,6 @@ bool ThreadTraceMemObjectsCommand::validateMemory() { return true; } -void TransferBufferFileCommand::releaseResources() { - for (uint i = 0; i < NumStagingBuffers; ++i) { - if (NULL != staging_[i]) { - staging_[i]->release(); - } - } - - // Call the parent - OneMemoryArgCommand::releaseResources(); -} - -void TransferBufferFileCommand::submit(device::VirtualDevice& device) { - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD)) { - void* srcDstBuffer = nullptr; - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - srcDstBuffer = mem->cpuMap(device); - } else { - srcDstBuffer = mem->cpuMap(device); - } - // Make HD transfer to the host accessible memory - bool writeBuffer(type() == CL_COMMAND_READ_SSG_FILE_AMD); - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - mem->cpuUnmap(device); - } else { - mem->cpuUnmap(device); - } - } else { - device.submitTransferBufferFromFile(*this); - } -} - -bool TransferBufferFileCommand::validateMemory() { - // Check if the destination buffer has direct host access - if (!(memory_->getMemFlags() & - (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { - // Allocate 
staging buffers - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = new (memory_->getContext()) - Buffer(memory_->getContext(), StagingBufferMemType, StagingBufferSize); - if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { - DevLogPrintfError("Staging Create failed, Staging[%d]: 0x%x", i, staging_[i]); - return false; - } - device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", staging_[i]->getSize()); - return false; - } - } - } - - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); - return false; - } - return true; -} - bool CopyMemoryP2PCommand::validateMemory() { amd::Device* queue_device = &queue()->device(); diff --git a/platform/command.hpp b/platform/command.hpp index 7022ec3d..45d9e365 100644 --- a/platform/command.hpp +++ b/platform/command.hpp @@ -1642,66 +1642,6 @@ class SvmUnmapMemoryCommand : public Command { void* svmPtr() const { return svmPtr_; } }; -/*! \brief A generic transfer memory from/to file command. - * - * \details Currently supports buffers only. Buffers - * are treated as 1D structures so origin_[0] and size_[0] - * are equivalent to offset_ and count_ respectively. - */ -class TransferBufferFileCommand : public OneMemoryArgCommand { - public: - static constexpr uint NumStagingBuffers = 2; - static constexpr size_t StagingBufferSize = 4 * Mi; - static constexpr uint StagingBufferMemType = CL_MEM_USE_PERSISTENT_MEM_AMD; - - protected: - const Coord3D origin_; //!< Origin of the region to write to - const Coord3D size_; //!< Size of the region to write to - LiquidFlashFile* file_; //!< The file object for data read - size_t fileOffset_; //!< Offset in the file for data read - amd::Memory* staging_[NumStagingBuffers]; //!< Staging buffers for transfer - - public: - TransferBufferFileCommand(cl_command_type type, HostQueue& queue, - const EventWaitList& eventWaitList, Memory& memory, - const Coord3D& origin, const Coord3D& size, LiquidFlashFile* file, - size_t fileOffset) - : OneMemoryArgCommand(queue, type, eventWaitList, memory), - origin_(origin), - size_(size), - file_(file), - fileOffset_(fileOffset) { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = NULL; - } - } - - virtual void releaseResources(); - - virtual void submit(device::VirtualDevice& device); - - //! Return the memory object to write to - Memory& memory() const { return *memory_; } - - //! Return the host memory to read from - LiquidFlashFile* file() const { return file_; } - - //! Returns file offset - size_t fileOffset() const { return fileOffset_; } - - //! Return the region origin - const Coord3D& origin() const { return origin_; } - //! Return the region size - const Coord3D& size() const { return size_; } - - //! Return the staging buffer for transfer - Memory& staging(uint i) const { return *staging_[i]; } - - bool validateMemory(); -}; - /*! \brief A P2P copy memory command * * \details Used for buffers only. 
Backends are expected diff --git a/platform/commandqueue.cpp b/platform/commandqueue.cpp index e9e56cb8..ce8ba4ae 100644 --- a/platform/commandqueue.cpp +++ b/platform/commandqueue.cpp @@ -66,8 +66,8 @@ bool HostQueue::terminate() { marker->awaitCompletion(); marker->release(); } - thread_.acceptingCommands_ = false; thread_.Release(); + thread_.acceptingCommands_ = false; } else { if (Os::isThreadAlive(thread_)) { Command* marker = nullptr; @@ -120,7 +120,7 @@ void HostQueue::finish() { return; } } - if (nullptr == command || vdev()->isHandlerPending()) { + if (nullptr == command || vdev()->isHandlerPending() || vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); } diff --git a/platform/commandqueue.hpp b/platform/commandqueue.hpp index 863f8eca..f98332bf 100644 --- a/platform/commandqueue.hpp +++ b/platform/commandqueue.hpp @@ -162,13 +162,13 @@ class HostQueue : public CommandQueue { Thread() : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE, !AMD_DIRECT_DISPATCH), acceptingCommands_(false), - virtualDevice_(NULL) {} + virtualDevice_(nullptr) {} //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); - if (virtualDevice_ != NULL) { + if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); Release(); } else { diff --git a/platform/context.cpp b/platform/context.cpp index dbb46c38..965269e8 100644 --- a/platform/context.cpp +++ b/platform/context.cpp @@ -34,14 +34,6 @@ #include "CL/cl_dx9_media_sharing.h" #endif //_WIN32 -#ifndef WITH_LIQUID_FLASH -#if (!defined(BUILD_HSA_TARGET) && defined(WITH_HSA_DEVICE) && \ - defined(WITH_AMDGPU_PRO)) || defined(_WIN32) -#define WITH_LIQUID_FLASH 1 -#include "lf.h" -#endif -#endif - namespace amd { Context::Context(const std::vector& devices, const Info& info) @@ -91,10 +83,6 @@ Context::~Context() { delete[] properties_; delete glenv_; - -#if WITH_LIQUID_FLASH - lfTerminate(); -#endif } int Context::checkProperties(const cl_context_properties* properties, Context::Info* info) { @@ -315,10 +303,6 @@ int Context::create(const intptr_t* properties) { } } -#if WITH_LIQUID_FLASH - lfInit(); -#endif - return result; } diff --git a/platform/kernel.cpp b/platform/kernel.cpp index 80cb1a12..52d4936a 100644 --- a/platform/kernel.cpp +++ b/platform/kernel.cpp @@ -253,7 +253,7 @@ bool KernelParameters::boundToSvmPointer(const Device& device, const_address cap return svmBound[index]; } -void KernelParameters::release(address mem, const amd::Device& device) const { +void KernelParameters::release(address mem) const { if (mem == nullptr) { // nothing to do! return; diff --git a/platform/kernel.hpp b/platform/kernel.hpp index e46fc5a7..8cb3b7f7 100644 --- a/platform/kernel.hpp +++ b/platform/kernel.hpp @@ -215,7 +215,7 @@ class KernelParameters : protected HeapObject { //! Capture the state of the parameters and return the stack base pointer. address capture(device::VirtualDevice& vDev, uint64_t lclMemSize, int32_t* error); //! Release the captured state of the parameters. - void release(address parameters, const amd::Device& device) const; + void release(address parameters) const; //! Allocate memory for this instance as well as the required storage for // the values_, defined_, and rawPointer_ arrays. 
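
With the commandqueue.cpp hunk above, HostQueue::finish() now also falls back to submitting a blocking marker when the virtual device reports a dirty fence, not only when there is no trailing command or a handler is still pending. A condensed sketch of that predicate, assuming the project headers below; the helper itself is hypothetical:

  #include "device/device.hpp"      // device::VirtualDevice
  #include "platform/command.hpp"   // amd::Command

  // Hypothetical helper mirroring the condition finish() checks before it submits a
  // trailing marker whose system-scope release also clears the dirty fence.
  static bool needsTrailingMarker(const amd::Command* last_command,
                                  device::VirtualDevice& vdev) {
    return last_command == nullptr ||   // nothing left to wait on directly
           vdev.isHandlerPending() ||   // a completion handler still has to run
           vdev.isFenceDirty();         // caches were not yet released at system scope
  }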
diff --git a/platform/memory.hpp b/platform/memory.hpp index 34ea71f8..ba89e18c 100644 --- a/platform/memory.hpp +++ b/platform/memory.hpp @@ -41,7 +41,7 @@ #define ROCCLR_MEM_HSA_SIGNAL_MEMORY (1u << 30) #define ROCCLR_MEM_INTERNAL_MEMORY (1u << 29) #define CL_MEM_VA_RANGE_AMD (1u << 28) -#define ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN (1u << 27) +#define ROCCLR_MEM_HSA_UNCACHED (1u << 27) namespace device { class Memory; @@ -663,42 +663,6 @@ class SvmBuffer : AllStatic { static Monitor AllocatedLock_; }; -#ifndef CL_COMMAND_WRITE_SSG_FILE_AMD -#define CL_COMMAND_WRITE_SSG_FILE_AMD 2 -#endif -#ifndef CL_COMMAND_READ_SSG_FILE_AMD -#define CL_COMMAND_READ_SSG_FILE_AMD 1 -#endif -#ifndef cl_file_flags_amd -typedef uint32_t cl_file_flags_amd; -#endif - //! Liquid flash extension -class LiquidFlashFile : public RuntimeObject { - private: - std::wstring name_; - cl_file_flags_amd flags_; - void* handle_; - uint32_t blockSize_; - uint64_t fileSize_; - - public: - LiquidFlashFile(const wchar_t* name, cl_file_flags_amd flags) - : name_(name), flags_(flags), handle_(NULL), blockSize_(0), fileSize_(0) {} - - ~LiquidFlashFile(); - - bool open(); - void close(); - - uint32_t blockSize() const { return blockSize_; }; - uint64_t fileSize() const { return fileSize_; }; - - bool transferBlock(bool read, void* dst, uint64_t bufferSize, uint64_t fileOffset, - uint64_t bufferOffset, uint64_t size) const; - - virtual ObjectType objectType() const { return ObjectTypeLiquidFlashFile; } -}; - class ArenaMemory: public Buffer { public: ArenaMemory(Context& context) diff --git a/platform/object.hpp b/platform/object.hpp index 9fca95ab..0f5ed636 100644 --- a/platform/object.hpp +++ b/platform/object.hpp @@ -41,9 +41,7 @@ #define AMD_CL_TYPES_DO(F) \ F(cl_counter_amd, Counter) \ F(cl_perfcounter_amd, PerfCounter) \ - F(cl_threadtrace_amd, ThreadTrace) \ - F(cl_file_amd, LiquidFlashFile) - + F(cl_threadtrace_amd, ThreadTrace) #define CL_TYPES_DO(F) \ KHR_CL_TYPES_DO(F) \ @@ -144,7 +142,6 @@ class RuntimeObject : public ReferenceCountedObject, public ICDDispatchedObject ObjectTypeQueue = 8, ObjectTypeSampler = 9, ObjectTypeThreadTrace = 10, - ObjectTypeLiquidFlashFile = 11 }; virtual ObjectType objectType() const = 0; diff --git a/utils/debug.cpp b/utils/debug.cpp index fc607564..cdda2516 100644 --- a/utils/debug.cpp +++ b/utils/debug.cpp @@ -49,17 +49,6 @@ extern "C" void breakpoint(void) { } //! \endcond -// ================================================================================================ -void report_fatal(const char* file, int line, const char* message) { - // FIXME_lmoriche: Obfuscate the message string - #if (defined(DEBUG)) - fprintf(outFile, "%s:%d: %s\n", file, line, message); - #else - fprintf(outFile, "%s\n", message); - #endif - ::abort(); -} - // ================================================================================================ void report_warning(const char* message) { fprintf(outFile, "Warning: %s\n", message); } diff --git a/utils/debug.hpp b/utils/debug.hpp index dd3ffd6e..07cb3b61 100644 --- a/utils/debug.hpp +++ b/utils/debug.hpp @@ -66,9 +66,6 @@ extern FILE* outFile; extern "C" void breakpoint(); //! \endcond -//! \brief Report a Fatal exception message and abort. -extern void report_fatal(const char* file, int line, const char* message); - //! \brief Display a warning message. extern void report_warning(const char* message); @@ -94,10 +91,10 @@ extern void log_printf(LogLevel level, const char* file, int line, uint64_t *sta #endif // __INTEL_COMPILER //! 
 \brief Abort the program if the invariant \a cond is false.
-#define guarantee(cond, message) \
+#define guarantee(cond, format, ...) \
   if (!(cond)) { \
-    amd::report_fatal(__FILE__, __LINE__, XSTR(message) ); \
-    amd::breakpoint(); \
+    amd::log_printf(amd::LOG_NONE, __FILE__, __LINE__, format, ##__VA_ARGS__); \
+    ::abort(); \
   }
 
 #define fixme_guarantee(cond, ...) guarantee(cond, __VA_ARGS__)
diff --git a/utils/versions.hpp b/utils/versions.hpp
index 2a85df5e..c8c96219 100644
--- a/utils/versions.hpp
+++ b/utils/versions.hpp
@@ -28,7 +28,7 @@
 #endif // AMD_PLATFORM_NAME
 
 #ifndef AMD_PLATFORM_BUILD_NUMBER
-#define AMD_PLATFORM_BUILD_NUMBER 3558
+#define AMD_PLATFORM_BUILD_NUMBER 3570
 #endif // AMD_PLATFORM_BUILD_NUMBER
 
 #ifndef AMD_PLATFORM_REVISION_NUMBER
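
After the debug.hpp change above, guarantee() takes a printf-style format string plus arguments, routes the message through amd::log_printf(), and then calls ::abort(); fixme_guarantee() forwards to the same macro. A small usage sketch (the validateCopy() helper and its parameters are hypothetical):

  #include <cstddef>
  #include "utils/debug.hpp"

  // Hypothetical caller: logs a formatted message and aborts when the size check fails.
  static void validateCopy(size_t copy_size, size_t buffer_size) {
    guarantee(copy_size <= buffer_size,
              "Copy size %zu exceeds buffer size %zu \n", copy_size, buffer_size);
  }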