diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml new file mode 100644 index 000000000..8af844a60 --- /dev/null +++ b/.azuredevops/rocm-ci.yml @@ -0,0 +1,56 @@ +resources: + repositories: + - repository: pipelines_repo + type: github + endpoint: ROCm + name: ROCm/ROCm + - repository: matching_repo + type: github + endpoint: ROCm + name: ROCm/HIP + ref: develop + pipelines: + - pipeline: hip_pipeline + source: HIP + trigger: + branches: + include: + - develop + +variables: +- group: common +- template: /.azuredevops/variables-global.yml@pipelines_repo + +trigger: + batch: true + branches: + include: + - develop + paths: + exclude: + - CODEOWNERS + - LICENCE + - '**/*.md' + +pr: + autoCancel: true + branches: + include: + - develop + paths: + exclude: + - CODEOWNERS + - LICENCE + - '**/*.md' + drafts: false + +jobs: +# if the build reason is a resource trigger, it means trigger is HIP repo build +# HIP repo build would have just built runtime, just copy their build products +# this is to ensure clr has latest good package for combined-packaging jobs +# combined-packaging jobs only have to look at clr pipeline for latest runtime +# to remove logic of comparing build products from both clr and hip triggers + - ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}: + - template: ${{ variables.CI_COMPONENT_PATH }}/copyHIP.yml@pipelines_repo + - ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}: + - template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo diff --git a/CODEOWNERS b/CODEOWNERS index 2990acceb..87f9d23e7 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @gargrahul @mangupta @rakesroy +* @cpaquot @gandryey @skudchad @mangupta @rakesroy diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..1036e0729 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,142 @@ +# Contributing to HIP/CLR # + +We welcome contributions to the HIP project. +CLR is a part of the HIP runtime for the AMD platform. Please follow these guidelines to help ensure your contributions will be successfully accepted. + +## Issue Discussion ## + +Please use the [GitHub Issue](https://github.com/ROCm/clr/issues) tab to notify us of issues. + +* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and + comment or post to provide additional details, such as how you reproduced this issue. +* If you're not sure if your issue is the same, err on the side of caution and file your issue. + You can add a comment to include the issue number (and link) for the similar issue. If we evaluate + your issue as being the same as the existing issue, we'll close the duplicate. +* If your issue doesn't exist, use the issue template to file a new issue. + * When filing an issue, be sure to provide as much information as possible, including script output so + we can collect information about your configuration. This helps reduce the time required to + reproduce your issue. + * Check your issue regularly, as we may require additional information to successfully reproduce the + issue. +* You may also open an issue to ask the maintainers questions about whether a proposed change + meets the acceptance criteria, or to discuss an idea pertaining to the library. + +## Acceptance Criteria ## + +clr (Common Language Runtime) contains the C++ code that implements the HIP runtime APIs on the AMD platform. +Bug fixes and performance improvements are both important goals in clr.
Because of this, when a pull request is created, the owner of the repository will review it and put it through automated testing to make sure that: +* The change builds on various OS platforms (Ubuntu, RHEL, etc.). +* The built package installs and runs on different GPU architectures (MI-series, Radeon-series cards, etc.). +* The test results achieve the expected goals. + +## Code Structure ## + +clr contains three main components: +- `hipamd` - contains the implementation of the HIP runtime on the AMD platform, which includes + - `include/hip/amd_detail` for headers, + - `/src` for the implementation of functionality such as HIP events, memory, modules, textures, etc. + +- `opencl` - contains the implementation of OpenCL on the AMD platform. + +- `rocclr` - contains the common runtime used by HIP and OpenCL, which includes + - `include`, header files, + - `device`, implementation of GPU-device-related interfaces to the backend, + - `compiler`, implementation of interfaces with the compiler, + - `utils`, implementation of useful utilities, + - `os`, implementation of OS-related interfaces. + + +## Coding Style ## + +clr is a C++ implementation of the HIP runtime API on the AMD platform. The code is written in C++ and follows the style guidelines below: +- Code Indentation: + - Tabs should be expanded to spaces. + - Use 4-space indentation. +- Capitalization and Naming + - Prefer camelCase for HIP interfaces and internal symbols. Note that HCC uses `_` as a separator. + This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational. + - Member variables should begin with a leading "_". This allows them to be easily distinguished from other variables or functions. + +- `{}` placement + - A namespace should be on the same line as `{`, separated by a space. + - A single-line if statement should still use a `{}` pair (even though C++ does not require it). + - For functions, the opening `{` should be placed on a new line. + - For if/else blocks, the opening `{` is placed on the same line as the if/else. Use a space to separate `{` from the if/else. For example, +```cpp + if (foo) { + doFoo(); + } else { + doFooElse(); + } +``` + +- Miscellaneous + - All references in function parameter lists should be const. + - "ihip" denotes internal HIP structures. These should not be exposed through the HIP API. + - The keyword TODO refers to a note that should be addressed in the long term. It could be a style issue, a software-architecture concern, or a known bug. + - FIXME refers to a short-term bug that needs to be addressed. + +- `HIP_INIT_API()` should be placed at the start of each top-level HIP API. It makes sure the HIP runtime is initialized, and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match those of the parent function. +- `hipExtGetLastError()` is an AMD-platform-specific API that returns the error code from the last HIP API called on the active host thread. `hipGetLastError()` and `hipPeekAtLastError()` can also return the last error that was returned by any of the HIP runtime calls in the same host thread (see the short sketch at the end of this guide). +- All HIP environment variables should begin with the prefix HIP_. +Environment variables should be long enough to describe their purpose but short enough to remember - perhaps 10-20 characters, with 3-4 parts separated by underscores. +To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any HIP application on the ROCm platform.
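For example, a minimal invocation might look like this (the binary name `./my_hip_app` is only a placeholder for any HIP application):

```console
HIP_PRINT_ENV=1 ./my_hip_app
```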
+HIPCC or other tools may support additional environment variables, which should follow the above convention. + +## Pull Request Guidelines ## + +By creating a pull request, you agree to the statements made in the code license section. Your pull request should target the default branch. Our current default branch is the develop branch, which serves as our integration branch. + +Follow existing best practices for writing a good Git commit message. + +Some tips: + http://chris.beams.io/posts/git-commit/ + https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message + +In particular: + - Use imperative voice, i.e. "Fix this bug", "Refactor the XYZ routine", "Update the doc". + Not: "Fixing the bug", "Fixed the bug", "Bug fix", etc. + - The subject should summarize the commit. Do not end the subject with a period. Use a blank line + after the subject. + +### Deliverables ### + +HIP is an open source library. Because of this, we include the following license description at the top of every source file. +If you create new source files in the repository, please include this text in them as well (replacing "xx" with the digits for the current year): +``` +// Copyright (c) 20xx Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +``` + +### Process ### + +After you create a PR, you can take a look at a diff of the changes you made using the PR's "Files" tab. + +PRs must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. + +Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table +to view logs associated with a check if it fails. + +During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is +needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. +When a modification request has been completed, the conversation thread about it will be marked as resolved. + +To update the code in your PR (e.g. in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
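As a small, self-contained sketch of the error-query APIs mentioned in the coding-style section above: it assumes an out-of-range device index (1000) to force an error, and the file and binary names are placeholders only.

```cpp
// Minimal sketch: hipSetDevice() with an assumed-invalid device index records a
// HIP error on this host thread; the two query calls then read that state.
// Build (assumed file name): hipcc last_error_demo.cpp -o last_error_demo
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  (void)hipSetDevice(1000);  // assumed out of range, so an error is recorded

  // hipPeekAtLastError() reads the last error without clearing it;
  // hipGetLastError() reads it and resets the saved state to hipSuccess.
  printf("peek : %s\n", hipGetErrorString(hipPeekAtLastError()));
  printf("get  : %s\n", hipGetErrorString(hipGetLastError()));
  printf("after: %s\n", hipGetErrorString(hipGetLastError()));
  return 0;
}
```

On the AMD platform, `hipExtGetLastError()` can be queried in the same way when the behavior described above (the last error returned on the active host thread) is specifically needed.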
+ diff --git a/README.md b/README.md index 9b46e3aba..8b5b0730d 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Building clr requires `rocm-hip-libraries` meta package, which provides the pre- Users can also build `OCL` and `HIP` at the same time by passing `-DCLR_BUILD_HIP=ON -DCLR_BUILD_OCL=ON` to configure command. -For detail instructions, please refer to [how to build HIP](https://rocm.docs.amd.com/projects/HIP/en/latest/developer_guide/build.html) +For detail instructions, please refer to [how to build HIP](https://rocm.docs.amd.com/projects/HIP/en/latest/install/build.html) ## Tests diff --git a/hipamd/include/hip/amd_detail/amd_hip_atomic.h b/hipamd/include/hip/amd_detail/amd_hip_atomic.h index e35a79abd..d6e4d8186 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_atomic.h +++ b/hipamd/include/hip/amd_detail/amd_hip_atomic.h @@ -973,22 +973,13 @@ inline unsigned int atomicInc(unsigned int* address, unsigned int val) { #if defined(__gfx941__) - __device__ - extern - unsigned int __builtin_amdgcn_atomic_inc( - unsigned int*, - unsigned int, - unsigned int, - unsigned int, - bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32"); - return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( address, val, [](unsigned int& x, unsigned int y) { x = (x >= y) ? 0 : (x + 1); }, [=]() { return - __builtin_amdgcn_atomic_inc(address, val, __ATOMIC_RELAXED, 1, false); + __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent"); }); #else return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent"); @@ -1001,22 +992,13 @@ inline unsigned int atomicDec(unsigned int* address, unsigned int val) { #if defined(__gfx941__) - __device__ - extern - unsigned int __builtin_amdgcn_atomic_dec( - unsigned int*, - unsigned int, - unsigned int, - unsigned int, - bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32"); - return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>( address, val, [](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); }, [=]() { return - __builtin_amdgcn_atomic_dec(address, val, __ATOMIC_RELAXED, 1, false); + __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent"); }); #else return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent"); diff --git a/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/hipamd/include/hip/amd_detail/amd_hip_bf16.h index 204269a84..193ca9174 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -1,7 +1,7 @@ /** * MIT License * - * Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -81,6 +81,17 @@ * To use these functions, include the header file \p hip_bf16.h in your program. */ +/** + * \defgroup HIP_INTRINSIC_BFLOAT16_RAW Bfloat16 Raw Struct + * \ingroup HIP_INTRINSIC_BFLOAT16 + * To use these functions, include the header file \p hip_bf16.h in your program. + */ + +/** + * \defgroup HIP_INTRINSIC_BFLOAT162_RAW Bfloat162 Raw Struct + * \ingroup HIP_INTRINSIC_BFLOAT16 + * To use these functions, include the header file \p hip_bf16.h in your program. 
+ */ #ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_ #define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_ @@ -93,13 +104,30 @@ #include "device_library_decls.h" // ocml conversion functions #include "math_fwd.h" // ocml device functions +#define __BF16_DEVICE__ __device__ #if defined(__HIPCC_RTC__) -#define __HOST_DEVICE__ __device__ static +#define __BF16_HOST_DEVICE__ __BF16_DEVICE__ #else #include #include #include -#define __HOST_DEVICE__ __host__ __device__ static inline +#define __BF16_HOST_DEVICE__ __host__ __BF16_DEVICE__ +#endif +#define __BF16_DEVICE_STATIC__ __BF16_DEVICE__ static inline +#define __BF16_HOST_DEVICE_STATIC__ __BF16_HOST_DEVICE__ static inline + +#if defined(__AVX512VL__) and defined(__AVX512BF16__) and not defined(__HIP_DEVICE_COMPILE__) +// Enable with -mavx512vl -mavx512bf16 +#if defined(__MINGW64__) +#include +#else +#include +#endif +#define HIP_BF16_AVX512_OP 1 +static_assert(sizeof(__bf16) == sizeof(unsigned short), + "sizeof __bf16 should match sizeof unsigned short"); +#else +#define HIP_BF16_AVX512_OP 0 #endif #define HIPRT_ONE_BF16 __float2bfloat16(1.0f) @@ -118,72 +146,361 @@ static_assert(CHAR_BIT == 8, "byte size should be of 8 bits"); #endif static_assert(sizeof(unsigned short) == 2, "size of unsigned short should be 2 bytes"); -/*! \brief Struct to represent a 16 bit brain floating point number. */ -struct __hip_bfloat16 { - unsigned short data; +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_RAW + * \brief represents raw bfloat16 type + */ +typedef struct __attribute__((aligned(2))) { + unsigned short x; +} __hip_bfloat16_raw; + +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_RAW + * \brief represents raw bfloat16x2 vector type + */ +typedef struct __attribute__((aligned(4))) { + unsigned short x; + unsigned short y; +} __hip_bfloat162_raw; + +/** + * \defgroup HIP_INTRINSIC_BFLOAT16_STRUCT + * \ingroup HIP_INTRINSIC_BFLOAT16 + * \brief Struct to represent a 16 bit brain floating point number. + * @{ + */ +struct __attribute__((aligned(2))) __hip_bfloat16 { + private: + __BF16_HOST_DEVICE_STATIC__ float bfloatraw_2_float(unsigned short val) { +#if HIP_BF16_AVX512_OP + union { + unsigned short us; + __bf16 bf16; + } u = {val}; + return _mm_cvtsbh_ss(u.bf16); +#else + unsigned int uval = val << 16; + union { + unsigned int u32; + float fp32; + } u = {uval}; + return u.fp32; +#endif + } + __BF16_HOST_DEVICE_STATIC__ unsigned short float_2_bfloatraw(float f) { +#if HIP_BF16_AVX512_OP + union { + __bf16 bf16; + unsigned short us; + } u = {_mm_cvtness_sbh(f)}; + return u.us; +#else + union { + float fp32; + unsigned int u32; + } u = {f}; + if (~u.u32 & 0x7f800000) { + // When the exponent bits are not all 1s, then the value is zero, normal, + // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus + // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). + // This causes the bfloat16's mantissa to be incremented by 1 if the 16 + // least significant bits of the float mantissa are greater than 0x8000, + // or if they are equal to 0x8000 and the least significant bit of the + // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when + // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already + // has the value 0x7f, then incrementing it causes it to become 0x00 and + // the exponent is incremented by one, which is the next higher FP value + // to the unrounded bfloat16 value. 
When the bfloat16 value is subnormal + // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up + // to a normal value with an exponent of 0x01 and a mantissa of 0x00. + // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, + // incrementing it causes it to become an exponent of 0xFF and a mantissa + // of 0x00, which is Inf, the next higher value to the unrounded value. + u.u32 += 0x7fff + ((u.u32 >> 16) & 1); // Round to nearest, round to even + } else if (u.u32 & 0xffff) { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bloat16's mantissa bits are all 0. + u.u32 |= 0x10000; // Preserve signaling NaN + } + return static_cast(u.u32 >> 16); +#endif + } + + __BF16_HOST_DEVICE_STATIC__ unsigned short double_2_bfloatraw(double d_in) { + union { + float fp32; + unsigned int u32; + } u = {static_cast(d_in)}; + double d = u.fp32; + + // Round to odd + if ((d_in > 0.0 && d > d_in) || (d_in < 0.0 && d < d_in)) { + u.u32--; + u.u32 |= 1; + } + + return float_2_bfloatraw(u.fp32); + } + + protected: + /*! \brief raw representation of bfloat16 */ + unsigned short __x; + + public: + // TODO: SWDEV-452411 + // Need to add constructor of __hip_bfloat16 from + // unsigned long long + // long long + // long + // unsigned long + // Casting directly to double might lead to double rounding. + + /*! \brief create __hip_bfloat16 from an unsigned int */ + __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned int val) + : __x(double_2_bfloatraw(static_cast(val))) {} + + /*! \brief create __hip_bfloat16 from a int */ + __BF16_HOST_DEVICE__ __hip_bfloat16(int val) + : __x(double_2_bfloatraw(static_cast(val))) {} + + /*! \brief create __hip_bfloat16 from an unsigned short */ + __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned short val) + : __x(float_2_bfloatraw(static_cast(val))) {} + + /*! \brief create __hip_bfloat16 from a short */ + __BF16_HOST_DEVICE__ __hip_bfloat16(short val) + : __x(float_2_bfloatraw(static_cast(val))) {} + + /*! \brief create __hip_bfloat16 from a double */ + __BF16_HOST_DEVICE__ __hip_bfloat16(const double val) : __x(double_2_bfloatraw(val)) {} + + /*! \brief create __hip_bfloat16 from a float */ + __BF16_HOST_DEVICE__ __hip_bfloat16(const float val) : __x(float_2_bfloatraw(val)) {} + + /*! \brief create __hip_bfloat16 from a __hip_bfloat16_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat16(const __hip_bfloat16_raw& val) : __x(val.x) {} + + /*! \brief default constructor */ + __BF16_HOST_DEVICE__ __hip_bfloat16() = default; + + /*! \brief return a __hip_bfloat16_raw */ + __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const { return __hip_bfloat16_raw{__x}; } + + /*! \brief return a __hip_bfloat16_raw cv qualifier */ + __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const volatile { + return __hip_bfloat16_raw{__x}; + } + + /*! \brief return false if bfloat value is +0.0 or -0.0, returns true otherwise */ + __BF16_HOST_DEVICE__ operator bool() const { + auto val = bfloatraw_2_float(__x); + return val != 0.0f && val != -0.0f; + } + + /*! 
\brief return a casted char from underlying float val */ + __BF16_HOST_DEVICE__ operator char() const { return static_cast(bfloatraw_2_float(__x)); } + + /*! \brief return a float */ + __BF16_HOST_DEVICE__ operator float() const { return bfloatraw_2_float(__x); } + + /*! \brief return a casted int casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator int() const { return static_cast(bfloatraw_2_float(__x)); } + + /*! \brief return a casted long casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator long() const { return static_cast(bfloatraw_2_float(__x)); } + + /*! \brief return a casted long long casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator long long() const { + return static_cast(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted short casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator short() const { return static_cast(bfloatraw_2_float(__x)); } + + /*! \brief return a casted signed char from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator signed char() const { + return static_cast(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned char casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned char() const { + return static_cast(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned int casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned int() const { + return static_cast(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned long() const { + return static_cast(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned long long from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned long long() const { + return static_cast(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned short from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned short() const { + return static_cast(bfloatraw_2_float(__x)); + } + + // TODO: SWDEV-452411 add operator which converts unsigned long long and long long to bfloat + + /*! \brief assign value from an unsigned int */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned int val) { + __x = float_2_bfloatraw(static_cast(val)); + return *this; + } + + /*! \brief assign value from a int */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(int val) { + __x = float_2_bfloatraw(static_cast(val)); + return *this; + } + + /*! \brief assign value from an unsigned short */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned short val) { + __x = float_2_bfloatraw(static_cast(val)); + return *this; + } + + /*! \brief assign value from a short int */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(short val) { + __x = float_2_bfloatraw(static_cast(val)); + return *this; + } + + /*! \brief assign value from a double */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const double f) { + __x = float_2_bfloatraw(static_cast(f)); + return *this; + } + + /*! \brief assign value from a float */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const float f) { + __x = float_2_bfloatraw(f); + return *this; + } + + /*! \brief assign value from a __hip_bfloat16_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) { + __x = hr.x; + return *this; + } + + /*! 
\brief assign value from a __hip_bfloat16_raw volatile */ + __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) volatile { + __x = hr.x; + return *this; + } + + /*! \brief assign value from a __hip_bfloat16_raw cv qualifier */ + __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=( + const volatile __hip_bfloat16_raw& hr) volatile { + __x = hr.x; + return *this; + } }; +/**@}*/ + +/** + * \defgroup HIP_INTRINSIC_BFLOAT162_STRUCT + * \ingroup HIP_INTRINSIC_BFLOAT16 + * \brief Struct to represent a two 16 bit brain floating point number. + * @{ + */ +struct __attribute__((aligned(4))) __hip_bfloat162 { + protected: + __hip_bfloat16 x; /*! \brief raw representation of bfloat16 */ + __hip_bfloat16 y; /*! \brief raw representation of bfloat16 */ + + + public: + /*! \brief create __hip_bfloat162 from __hip_bfloat162_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162_raw& h2r) + : x(__hip_bfloat16(__hip_bfloat16_raw{h2r.x})), + y(__hip_bfloat16(__hip_bfloat16_raw{h2r.y})) {} + + /*! \brief copy constructor of __hip_bfloat162 */ + __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162& val) { + __hip_bfloat162_raw hr = val; + x = __hip_bfloat16_raw{hr.x}; + y = __hip_bfloat16_raw{hr.y}; + } + + /*! \brief create __hip_bfloat162 from two __hip_bfloat16 */ + __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat16& a, const __hip_bfloat16& b) + : x(a), y(b) {} + + /*! \brief default constructor of __hip_bfloat162 */ + __BF16_HOST_DEVICE__ __hip_bfloat162() = default; + + /*! \brief return a __hip_bfloat162_raw */ + __BF16_HOST_DEVICE__ operator __hip_bfloat162_raw() const { + __hip_bfloat16_raw l = x; + __hip_bfloat16_raw r = y; + return __hip_bfloat162_raw{l.x, r.x}; + } -/*! \brief Struct to represent two 16 bit brain floating point numbers. */ -struct __hip_bfloat162 { - __hip_bfloat16 x; - __hip_bfloat16 y; + /*! \brief return a float2 */ + __BF16_HOST_DEVICE__ operator float2() const { +#if HIP_BF16_AVX512_OP + union { + __hip_bfloat162_raw raw2; + __bf16 bf162[2]; + static_assert(sizeof(__bf16[2]) == sizeof(__hip_bfloat162_raw)); + } u; + u.raw2 = *this; + __m128bh pbf16{u.bf162[0], u.bf162[1], 0, 0}; + __m128 pf32 = _mm_cvtpbh_ps(pbf16); + float2 ret(pf32[0], pf32[1]); +#else + float2 ret(x, y); +#endif + return ret; + } + + /*! \brief assign value from __hip_bfloat162_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162_raw& h2r) { + x = __hip_bfloat16(__hip_bfloat16_raw{h2r.x}); + y = __hip_bfloat16(__hip_bfloat16_raw{h2r.y}); + return *this; + } + + /*! 
\brief assign value from __hip_bfloat162 */ + __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162& src) { + __hip_bfloat162_raw hr = src; + x = __hip_bfloat16(__hip_bfloat16_raw{hr.x}); + y = __hip_bfloat16(__hip_bfloat16_raw{hr.y}); + return *this; + } }; +/**@}*/ /** * \ingroup HIP_INTRINSIC_BFLOAT16_CONV * \brief Converts bfloat16 to float */ -__HOST_DEVICE__ inline float __bfloat162float(__hip_bfloat16 a) { - unsigned int uval = 0; - uval = a.data << 16; - union { - unsigned int u32; - float fp32; - } u = {uval}; - return u.fp32; +__BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a) { + float ret = a; + return ret; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_CONV * \brief Converts float to bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __float2bfloat16(float f) { - __hip_bfloat16 ret; - union { - float fp32; - unsigned int u32; - } u = {f}; - if (~u.u32 & 0x7f800000) { - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). - // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - u.u32 += 0x7fff + ((u.u32 >> 16) & 1); // Round to nearest, round to even - } else if (u.u32 & 0xffff) { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bloat16's mantissa bits are all 0. 
- u.u32 |= 0x10000; // Preserve signaling NaN - } - - ret.data = (u.u32 >> 16); +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f) { + __hip_bfloat16 ret{f}; return ret; } @@ -191,43 +508,51 @@ __HOST_DEVICE__ __hip_bfloat16 __float2bfloat16(float f) { * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Converts and moves bfloat162 to float2 */ -__HOST_DEVICE__ float2 __bfloat1622float2(const __hip_bfloat162 a) { - return float2{__bfloat162float(a.x), __bfloat162float(a.y)}; +__BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a) { + float2 ret = a; + return ret; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Moves bfloat16 value to bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a) { - return __hip_bfloat162{a, a}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a) { + return __hip_bfloat162(a, a); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets bits in a __hip_bfloat16 as a signed short integer */ -__HOST_DEVICE__ short int __bfloat16_as_short(const __hip_bfloat16 h) { return (short)h.data; } +__BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h) { + short ret = h; + return ret; +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets bits in a __hip_bfloat16 as an unsigned signed short integer */ -__HOST_DEVICE__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h) { return h.data; } +__BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h) { + unsigned short ret = h; + return ret; +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Convert double to __hip_bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __double2bfloat16(const double a) { - return __float2bfloat16((float)a); +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a) { + __hip_bfloat16 ret{a}; + return ret; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Convert float2 to __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) { return __hip_bfloat162{__float2bfloat16(a.x), __float2bfloat16(a.y)}; } @@ -235,97 +560,117 @@ __HOST_DEVICE__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) { * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Combine two __hip_bfloat16 to __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b) { - return __hip_bfloat162{a, b}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, + const __hip_bfloat16 b) { + return __hip_bfloat162(a, b); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns high 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a) { return a.y; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat16(__hip_bfloat16_raw{hr.y}); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns high 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a) { - return __hip_bfloat162{a.y, a.y}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat162(__hip_bfloat16_raw{hr.y}, __hip_bfloat16_raw{hr.y}); } /** * \ingroup 
HIP_INTRINSIC_BFLOAT162_CONV * \brief Converts high 16 bits of __hip_bfloat162 to float and returns the result */ -__HOST_DEVICE__ float __high2float(const __hip_bfloat162 a) { return __bfloat162float(a.y); } +__BF16_HOST_DEVICE_STATIC__ float __high2float(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __bfloat162float(__hip_bfloat16(__hip_bfloat16_raw{hr.y})); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Extracts high 16 bits from each and combines them */ -__HOST_DEVICE__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, - const __hip_bfloat162 b) { - return __hip_bfloat162{a.y, b.y}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hip_bfloat162_raw{hr_a.y, hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns low 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a) { return a.x; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat16(hr.x); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns low 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a) { - return __hip_bfloat162{a.x, a.x}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat162(hr.x, hr.x); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Converts low 16 bits of __hip_bfloat162 to float and returns the result */ -__HOST_DEVICE__ float __low2float(const __hip_bfloat162 a) { return __bfloat162float(a.x); } +__BF16_HOST_DEVICE_STATIC__ float __low2float(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __bfloat162float(__hip_bfloat16(__hip_bfloat16_raw{hr.x})); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Swaps both halves */ -__HOST_DEVICE__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a) { - return __hip_bfloat162{a.y, a.x}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat162(__hip_bfloat162_raw{hr.y, hr.x}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Extracts low 16 bits from each and combines them */ -__HOST_DEVICE__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{a.x, b.x}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hip_bfloat162_raw{hr_a.x, hr_b.x}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets short int into a bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __short_as_bfloat16(const short int a) { - return __hip_bfloat16{(unsigned short)a}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a) { + return __hip_bfloat16(a); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets unsigned short int into a bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a) { - return __hip_bfloat16{a}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a) { + return __hip_bfloat16(a); } - /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief 
Adds two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b)); } @@ -333,7 +678,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Subtracts two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) - __bfloat162float(b)); } @@ -341,7 +686,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Divides two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) / __bfloat162float(b)); } @@ -349,8 +694,8 @@ __HOST_DEVICE__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Performs FMA of given bfloat16 values */ -__device__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, - const __hip_bfloat16 c) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, + const __hip_bfloat16 c) { return __float2bfloat16( __ocml_fma_f32(__bfloat162float(a), __bfloat162float(b), __bfloat162float(c))); } @@ -359,7 +704,7 @@ __device__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Multiplies two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b)); } @@ -367,85 +712,110 @@ __HOST_DEVICE__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Negate a bfloat16 value */ -__HOST_DEVICE__ __hip_bfloat16 __hneg(const __hip_bfloat16 a) { - auto ret = a; - ret.data ^= 0x8000; - return ret; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hneg(const __hip_bfloat16 a) { + __hip_bfloat16_raw hr = a; + hr.x ^= 0x8000; + return __hip_bfloat16(hr); } /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Returns absolute of a bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __habs(const __hip_bfloat16 a) { - auto ret = a; - ret.data &= 0x7FFF; - return ret; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __habs(const __hip_bfloat16 a) { + __hip_bfloat16_raw hr = a; + hr.x &= 0x7FFF; + return __hip_bfloat16(hr); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Divides bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__float2bfloat16(__bfloat162float(a.x) / __bfloat162float(b.x)), - __float2bfloat16(__bfloat162float(a.y) / __bfloat162float(b.y))}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__float2bfloat16(__bfloat162float(__hip_bfloat16_raw{hr_a.x}) / + 
__bfloat162float(__hip_bfloat16_raw{hr_b.x})), + __float2bfloat16(__bfloat162float(__hip_bfloat16_raw{hr_a.y}) / + __bfloat162float(__hip_bfloat16_raw{hr_b.y}))); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Returns absolute of a bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __habs2(const __hip_bfloat162 a) { - return __hip_bfloat162{__habs(a.x), __habs(a.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __habs2(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr_a = a; + return __hip_bfloat162(__habs(__hip_bfloat16_raw{hr_a.x}), __habs(__hip_bfloat16_raw{hr_a.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Adds two bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hadd(a.x, b.x), __hadd(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hadd(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hadd(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Performs FMA of given bfloat162 values */ -__device__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, - const __hip_bfloat162 c) { - return __hip_bfloat162{__hfma(a.x, b.x, c.x), __hfma(a.y, b.y, c.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, + const __hip_bfloat162 c) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + __hip_bfloat162_raw hr_c = c; + return __hip_bfloat162( + __hfma(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}, __hip_bfloat16_raw{hr_c.x}), + __hfma(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}, __hip_bfloat16_raw{hr_c.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Multiplies two bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hmul(a.x, b.x), __hmul(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hmul(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hmul(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Converts a bfloat162 into negative */ -__HOST_DEVICE__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a) { - return __hip_bfloat162{__hneg(a.x), __hneg(a.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr_a = a; + return __hip_bfloat162(__hneg(__hip_bfloat16_raw{hr_a.x}), __hneg(__hip_bfloat16_raw{hr_a.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Subtracts two bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hsub(a.x, b.x), __hsub(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hsub(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hsub(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to multiply two __hip_bfloat16 numbers */ 
-__HOST_DEVICE__ __hip_bfloat16 operator*(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator*(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hmul(l, r); } @@ -453,7 +823,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator*(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to multiply-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hmul(l, r); return l; } @@ -462,13 +832,14 @@ __HOST_DEVICE__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to unary+ on a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator+(const __hip_bfloat16& l) { return l; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l) { return l; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to add two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16 operator+(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hadd(l, r); } @@ -476,13 +847,14 @@ __HOST_DEVICE__ __hip_bfloat16 operator+(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to negate a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator-(const __hip_bfloat16& l) { return __hneg(l); } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l) { return __hneg(l); } /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to subtract two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16 operator-(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hsub(l, r); } @@ -490,7 +862,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator-(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to post increment a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) { auto ret = l; l = __hadd(l, HIPRT_ONE_BF16); return ret; @@ -500,7 +872,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to pre increment a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16& operator++(__hip_bfloat16& l) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator++(__hip_bfloat16& l) { l = __hadd(l, HIPRT_ONE_BF16); return l; } @@ -509,7 +881,7 @@ __HOST_DEVICE__ __hip_bfloat16& operator++(__hip_bfloat16& l) { * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to post decrement a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) { auto ret = l; l = __hsub(l, HIPRT_ONE_BF16); return ret; @@ -519,7 +891,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to pre decrement a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16& operator--(__hip_bfloat16& l) { 
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator--(__hip_bfloat16& l) { l = __hsub(l, HIPRT_ONE_BF16); return l; } @@ -528,7 +900,7 @@ __HOST_DEVICE__ __hip_bfloat16& operator--(__hip_bfloat16& l) { * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to add-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hadd(l, r); return l; } @@ -537,7 +909,7 @@ __HOST_DEVICE__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to subtract-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hsub(l, r); return l; } @@ -546,7 +918,8 @@ __HOST_DEVICE__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to divide two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16 operator/(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator/(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hdiv(l, r); } @@ -554,7 +927,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator/(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to divide-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hdiv(l, r); return l; } @@ -563,7 +936,8 @@ __HOST_DEVICE__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to multiply two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator*(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator*(const __hip_bfloat162& l, + const __hip_bfloat162& r) { return __hmul2(l, r); } @@ -571,7 +945,8 @@ __HOST_DEVICE__ __hip_bfloat162 operator*(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to multiply-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator*=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator*=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __hmul2(l, r); return l; } @@ -580,13 +955,14 @@ __HOST_DEVICE__ __hip_bfloat162& operator*=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to unary+ on a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator+(const __hip_bfloat162& l) { return l; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l) { return l; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to add two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator+(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l, + const __hip_bfloat162& r) { return __hadd2(l, r); } @@ -594,13 +970,16 @@ __HOST_DEVICE__ __hip_bfloat162 operator+(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief 
Operator to negate a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator-(const __hip_bfloat162& l) { return __hneg2(l); } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l) { + return __hneg2(l); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to subtract two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator-(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l, + const __hip_bfloat162& r) { return __hsub2(l, r); } @@ -608,7 +987,7 @@ __HOST_DEVICE__ __hip_bfloat162 operator-(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to post increment a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) { auto ret = l; l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return ret; @@ -618,7 +997,7 @@ __HOST_DEVICE__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to pre increment a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162& operator++(__hip_bfloat162& l) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator++(__hip_bfloat162& l) { l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return l; } @@ -627,7 +1006,7 @@ __HOST_DEVICE__ __hip_bfloat162& operator++(__hip_bfloat162& l) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to post decrement a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) { auto ret = l; l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return ret; @@ -637,7 +1016,7 @@ __HOST_DEVICE__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to pre decrement a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162& operator--(__hip_bfloat162& l) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator--(__hip_bfloat162& l) { l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return l; } @@ -646,7 +1025,8 @@ __HOST_DEVICE__ __hip_bfloat162& operator--(__hip_bfloat162& l) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to add-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator+=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator+=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __hadd2(l, r); return l; } @@ -655,7 +1035,8 @@ __HOST_DEVICE__ __hip_bfloat162& operator+=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to subtract-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator-=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator-=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __hsub2(l, r); return l; } @@ -664,7 +1045,8 @@ __HOST_DEVICE__ __hip_bfloat162& operator-=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to divide two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator/(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator/(const __hip_bfloat162& l, + const __hip_bfloat162& r) 
{ return __h2div(l, r); } @@ -672,7 +1054,8 @@ __HOST_DEVICE__ __hip_bfloat162 operator/(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to divide-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator/=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator/=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __h2div(l, r); return l; } @@ -681,7 +1064,7 @@ __HOST_DEVICE__ __hip_bfloat162& operator/=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values */ -__HOST_DEVICE__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) == __bfloat162float(b); } @@ -689,7 +1072,7 @@ __HOST_DEVICE__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered equal */ -__HOST_DEVICE__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) < __bfloat162float(b)) && !(__bfloat162float(a) > __bfloat162float(b)); } @@ -698,7 +1081,7 @@ __HOST_DEVICE__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - greater than */ -__HOST_DEVICE__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) > __bfloat162float(b); } @@ -706,7 +1089,7 @@ __HOST_DEVICE__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered greater than */ -__HOST_DEVICE__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) <= __bfloat162float(b)); } @@ -714,7 +1097,7 @@ __HOST_DEVICE__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - greater than equal */ -__HOST_DEVICE__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) >= __bfloat162float(b); } @@ -722,7 +1105,7 @@ __HOST_DEVICE__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered greater than equal */ -__HOST_DEVICE__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) < __bfloat162float(b)); } @@ -730,7 +1113,7 @@ __HOST_DEVICE__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - not equal */ -__HOST_DEVICE__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) != __bfloat162float(b); } @@ -738,7 +1121,7 @@ __HOST_DEVICE__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) { * 
\ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered not equal */ -__HOST_DEVICE__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) == __bfloat162float(b)); } @@ -746,7 +1129,7 @@ __HOST_DEVICE__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - return max */ -__HOST_DEVICE__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b) { #if __HIP_DEVICE_COMPILE__ return __float2bfloat16(__ocml_fmax_f32(__bfloat162float(a), __bfloat162float(b))); #else @@ -758,7 +1141,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - return min */ -__HOST_DEVICE__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b) { #if __HIP_DEVICE_COMPILE__ return __float2bfloat16(__ocml_fmin_f32(__bfloat162float(a), __bfloat162float(b))); #else @@ -770,7 +1153,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - less than operator */ -__HOST_DEVICE__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) < __bfloat162float(b); } @@ -778,7 +1161,7 @@ __HOST_DEVICE__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered less than */ -__HOST_DEVICE__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) >= __bfloat162float(b)); } @@ -786,7 +1169,7 @@ __HOST_DEVICE__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - less than equal */ -__HOST_DEVICE__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) <= __bfloat162float(b); } @@ -794,7 +1177,7 @@ __HOST_DEVICE__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered less than equal */ -__HOST_DEVICE__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) > __bfloat162float(b)); } @@ -802,208 +1185,282 @@ __HOST_DEVICE__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Checks if number is inf */ -__HOST_DEVICE__ int __hisinf(const __hip_bfloat16 a) { - unsigned short sign = a.data & 0x8000U; -#if __HIP_DEVICE_COMPILE__ - int res = __ocml_isinf_f32(__bfloat162float(a)); -#else - int res = std::isinf(__bfloat162float(a)) ? 1 : 0; -#endif - return (res == 0) ? res : ((sign != 0U) ? 
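The unordered comparison helpers above (`__hequ`, `__hgtu`, `__hgeu`, `__hltu`, `__hleu`, `__hneu`) are written as the negation of the opposite ordered comparison, so they return true whenever either operand is NaN, while the ordered forms return false in that case. A minimal sketch of the difference, not part of the header, using the compiler's `__builtin_nanf` to produce a NaN:

```cpp
__device__ void bf16_compare_demo() {
  __hip_bfloat16 one = __float2bfloat16(1.0f);
  __hip_bfloat16 nan = __float2bfloat16(__builtin_nanf(""));

  bool ordered   = __hlt(one, nan);   // false: an ordered compare against NaN always fails
  bool unordered = __hltu(one, nan);  // true:  implemented as !(a >= b)
  (void)ordered; (void)unordered;
}
```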
-res : res); +__BF16_HOST_DEVICE_STATIC__ int __hisinf(const __hip_bfloat16 a) { + __hip_bfloat16_raw hr = a; + return !(~hr.x & 0x7f80) && !(hr.x & 0x7f); } /** * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Checks if number is nan */ -__HOST_DEVICE__ bool __hisnan(const __hip_bfloat16 a) { -#if __HIP_DEVICE_COMPILE__ - return __ocml_isnan_f32(__bfloat162float(a)); -#else - return std::isnan(__bfloat162float(a)); -#endif +__BF16_HOST_DEVICE_STATIC__ bool __hisnan(const __hip_bfloat16 a) { + __hip_bfloat16_raw hr = a; + return !(~hr.x & 0x7f80) && +(hr.x & 0x7f); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Checks if two numbers are equal */ -__HOST_DEVICE__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __heq(a.x, b.x) && __heq(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __heq(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __heq(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Checks if two numbers are equal - unordered */ -__HOST_DEVICE__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hequ(a.x, b.x) && __hequ(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hequ(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hequ(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a >= b */ -__HOST_DEVICE__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hge(a.x, b.x) && __hge(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hge(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hge(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a >= b - unordered */ -__HOST_DEVICE__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hgeu(a.x, b.x) && __hgeu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hgeu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hgeu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a > b */ -__HOST_DEVICE__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hgt(a.x, b.x) && __hgt(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hgt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hgt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a > b - unordered */ -__HOST_DEVICE__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hgtu(a.x, b.x) && __hgtu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hgtu(__hip_bfloat16_raw{hr_a.x}, 
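The reworked `__hisinf`/`__hisnan` classify the value straight from the raw bits: bfloat16 carries 1 sign bit, 8 exponent bits (mask `0x7f80`) and 7 mantissa bits (mask `0x7f`), and an all-ones exponent means infinity when the mantissa is zero and NaN otherwise. The sign bit is masked out, so unlike the removed float-based version, `__hisinf` now returns 1 for negative infinity as well. A small host-side worked example of the same bit test; the encodings are illustrative values, not code from the header:

```cpp
#include <cstdio>

static bool exp_all_ones(unsigned short x) { return !(~x & 0x7f80); }  // 8 exponent bits all set
static bool mant_is_zero(unsigned short x) { return !(x & 0x7f); }     // 7 mantissa bits all clear

int main() {
  unsigned short pos_inf = 0x7f80;  // sign 0, exponent 0xff, mantissa 0
  unsigned short a_nan   = 0x7fc0;  // exponent 0xff, mantissa != 0
  unsigned short one     = 0x3f80;  // 1.0 in bfloat16
  std::printf("inf: %d %d %d\n",
              exp_all_ones(pos_inf) && mant_is_zero(pos_inf),   // 1
              exp_all_ones(a_nan) && mant_is_zero(a_nan),       // 0
              exp_all_ones(one) && mant_is_zero(one));          // 0
  std::printf("nan: %d\n", exp_all_ones(a_nan) && !mant_is_zero(a_nan));  // 1
  return 0;
}
```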
__hip_bfloat16_raw{hr_b.x}) && + __hgtu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a <= b */ -__HOST_DEVICE__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hle(a.x, b.x) && __hle(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hle(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hle(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a <= b - unordered */ -__HOST_DEVICE__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hleu(a.x, b.x) && __hleu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hleu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hleu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a < b */ -__HOST_DEVICE__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hlt(a.x, b.x) && __hlt(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hlt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hlt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a < b - unordered */ -__HOST_DEVICE__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hltu(a.x, b.x) && __hltu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hltu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hltu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a != b */ -__HOST_DEVICE__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hne(a.x, b.x) && __hne(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hne(__hip_bfloat16(__hip_bfloat16_raw{hr_a.x}), + __hip_bfloat16(__hip_bfloat16_raw{hr_b.x})) && + __hne(__hip_bfloat16(__hip_bfloat16_raw{hr_a.y}), __hip_bfloat16(__hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a != b */ -__HOST_DEVICE__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hneu(a.x, b.x) && __hneu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hneu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) || + __hneu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a != b, returns 1.0 if equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__heq(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__heq(a.y, b.y) ? 
HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__heq(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__heq(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a >= b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hge(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hge(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hge(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hge(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a > b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hgt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hgt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hgt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hgt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a is NaN, returns 1.0 if NaN, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a) { - return __hip_bfloat162{{__hisnan(a.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hisnan(a.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr_a = a; + return __hip_bfloat162{{__hisnan(__hip_bfloat16_raw{hr_a.x}) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, + {__hisnan(__hip_bfloat16_raw{hr_a.y}) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a <= b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hle(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hle(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hle(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hle(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? 
HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a < b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hlt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hlt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hlt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hlt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Returns max of two elements */ -__HOST_DEVICE__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hmax(a.x, b.x), __hmax(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hmax(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hmax(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Returns min of two elements */ -__HOST_DEVICE__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hmin(a.x, b.x), __hmin(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hmin(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hmin(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Checks for not equal to */ -__HOST_DEVICE__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hne(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hne(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hne(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hne(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? 
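Two flavours of packed comparison coexist in this block: the `__hb*2` helpers collapse both lanes into a single `bool`, while the `__h*2` helpers return a `__hip_bfloat162` whose lanes are 1.0 or 0.0, which works naturally as a multiplicative mask. A short usage sketch; it assumes the `__hmul2` intrinsic defined earlier in this header, and the function name is made up:

```cpp
__device__ __hip_bfloat162 keep_equal_lanes(__hip_bfloat162 a, __hip_bfloat162 b) {
  if (__hbeq2(a, b)) {                    // single bool: both lanes compare equal
    return a;
  }
  __hip_bfloat162 mask = __heq2(a, b);    // per-lane mask of 1.0 / 0.0
  return __hmul2(a, mask);                // zero out the lanes of a that differ from b
}
```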
HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform an equal compare on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __heq(l, r); } @@ -1011,7 +1468,7 @@ __HOST_DEVICE__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a not equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hne(l, r); } @@ -1019,7 +1476,7 @@ __HOST_DEVICE__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a less than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hlt(l, r); } @@ -1027,7 +1484,7 @@ __HOST_DEVICE__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a less than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hle(l, r); } @@ -1035,7 +1492,7 @@ __HOST_DEVICE__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a greater than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hgt(l, r); } @@ -1043,7 +1500,7 @@ __HOST_DEVICE__ bool operator>(const __hip_bfloat16& l, const __hip_bfloat16& r) * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a greater than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hge(l, r); } @@ -1051,55 +1508,60 @@ __HOST_DEVICE__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform an equal compare on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator==(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __heq(l.x, r.x) && __heq(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 ret = __heq2(l, r); + return ret.x != 0.0f && ret.y != 0.0f; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a not equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator!=(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hne(l.x, r.x) || __hne(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat162& l, const __hip_bfloat162& r) { + return !(l == r); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a less than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<(const __hip_bfloat162& l, 
const __hip_bfloat162& r) { - return __hlt(l.x, r.x) && __hlt(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x < fr.x && fl.y < fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a less than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<=(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hle(l.x, r.x) && __hle(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x <= fr.x && fl.y <= fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a greater than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hgt(l.x, r.x) && __hgt(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x > fr.x && fl.y > fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a greater than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>=(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hge(l.x, r.x) && __hge(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x >= fr.x && fl.y >= fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate ceil of bfloat16 */ -__device__ __hip_bfloat16 hceil(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hceil(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_ceil_f32(__bfloat162float(h))); } @@ -1107,7 +1569,7 @@ __device__ __hip_bfloat16 hceil(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate cosine of bfloat16 */ -__device__ __hip_bfloat16 hcos(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hcos(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_cos_f32(__bfloat162float(h))); } @@ -1115,7 +1577,7 @@ __device__ __hip_bfloat16 hcos(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate exponential of bfloat16 */ -__device__ __hip_bfloat16 hexp(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_exp_f32(__bfloat162float(h))); } @@ -1123,7 +1585,7 @@ __device__ __hip_bfloat16 hexp(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate exponential 10 of bfloat16 */ -__device__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_exp10_f32(__bfloat162float(h))); } @@ -1131,7 +1593,7 @@ __device__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate exponential 2 of bfloat16 */ -__device__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_exp2_f32(__bfloat162float(h))); } @@ -1139,7 +1601,7 @@ __device__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate floor of bfloat16 */ -__device__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) { return
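The packed relational operators defined just above convert both operands to `float2` and require the relation to hold in both lanes. A tiny illustration; the values and function name are arbitrary:

```cpp
__device__ void packed_relational_demo() {
  __hip_bfloat162 a(__float2bfloat16(1.0f), __float2bfloat16(2.0f));
  __hip_bfloat162 b(__float2bfloat16(3.0f), __float2bfloat16(1.0f));

  bool all_less = a < b;  // false: 1.0 < 3.0 holds, but 2.0 < 1.0 does not
  (void)all_less;
}
```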
__float2bfloat16(__ocml_floor_f32(__bfloat162float(h))); } @@ -1147,7 +1609,7 @@ __device__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate natural log of bfloat16 */ -__device__ __hip_bfloat16 hlog(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_log_f32(__bfloat162float(h))); } @@ -1155,7 +1617,7 @@ __device__ __hip_bfloat16 hlog(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate log 10 of bfloat16 */ -__device__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_log10_f32(__bfloat162float(h))); } @@ -1163,7 +1625,7 @@ __device__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate log 2 of bfloat16 */ -__device__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_log2_f32(__bfloat162float(h))); } @@ -1171,7 +1633,7 @@ __device__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate reciprocal */ -__device__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) { return __float2bfloat16(1.0f / (__bfloat162float(h))); } @@ -1179,7 +1641,7 @@ __device__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Round to nearest int */ -__device__ __hip_bfloat16 hrint(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hrint(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_rint_f32(__bfloat162float(h))); } @@ -1187,7 +1649,7 @@ __device__ __hip_bfloat16 hrint(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Reciprocal square root */ -__device__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_rsqrt_f32(__bfloat162float(h))); } @@ -1195,7 +1657,7 @@ __device__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate sin of bfloat16 */ -__device__ __hip_bfloat16 hsin(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hsin(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_sin_f32(__bfloat162float(h))); } @@ -1203,7 +1665,7 @@ __device__ __hip_bfloat16 hsin(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate sqrt of bfloat16 */ -__device__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_sqrt_f32(__bfloat162float(h))); } @@ -1211,7 +1673,7 @@ __device__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate truncate of bfloat16 */ -__device__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_trunc_f32(__bfloat162float(h))); } @@ -1219,119 +1681,134 @@ __device__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate ceil of bfloat162 */ -__device__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h) { - return __hip_bfloat162{hceil(h.x), hceil(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 
h2ceil(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hceil(__hip_bfloat16_raw{hr.x}), hceil(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate cosine of bfloat162 */ -__device__ __hip_bfloat162 h2cos(const __hip_bfloat162 h) { - return __hip_bfloat162{hcos(h.x), hcos(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2cos(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hcos(__hip_bfloat16_raw{hr.x}), hcos(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate exponential of bfloat162 */ -__device__ __hip_bfloat162 h2exp(const __hip_bfloat162 h) { - return __hip_bfloat162{hexp(h.x), hexp(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hexp(__hip_bfloat16_raw{hr.x}), hexp(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate exponential 10 of bfloat162 */ -__device__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h) { - return __hip_bfloat162{hexp10(h.x), hexp10(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hexp10(__hip_bfloat16_raw{hr.x}), hexp10(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate exponential 2 of bfloat162 */ -__device__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h) { - return __hip_bfloat162{hexp2(h.x), hexp2(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hexp2(__hip_bfloat16_raw{hr.x}), hexp2(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate floor of bfloat162 */ -__device__ __hip_bfloat162 h2floor(const __hip_bfloat162 h) { - return __hip_bfloat162{hfloor(h.x), hfloor(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2floor(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hfloor(__hip_bfloat16_raw{hr.x}), hfloor(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate natural log of bfloat162 */ -__device__ __hip_bfloat162 h2log(const __hip_bfloat162 h) { - return __hip_bfloat162{hlog(h.x), hlog(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hlog(__hip_bfloat16_raw{hr.x}), hlog(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate log 10 of bfloat162 */ -__device__ __hip_bfloat162 h2log10(const __hip_bfloat162 h) { - return __hip_bfloat162{hlog10(h.x), hlog10(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log10(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hlog10(__hip_bfloat16_raw{hr.x}), hlog10(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate log 2 of bfloat162 */ -__device__ __hip_bfloat162 h2log2(const __hip_bfloat162 h) { - return __hip_bfloat162{hlog2(h.x), hlog2(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log2(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hlog2(__hip_bfloat16_raw{hr.x}), hlog2(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate vector reciprocal */ -__device__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h) { - return __hip_bfloat162{hrcp(h.x), hrcp(h.y)}; 
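The `h2*` wrappers unpack the two lanes through the raw type and apply the corresponding scalar math function to each half. A minimal device-side usage sketch; the kernel and buffer names are invented for illustration:

```cpp
__global__ void exp_pairs(const __hip_bfloat162* in, __hip_bfloat162* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = h2exp(in[i]);  // one call computes exp() for both bfloat16 lanes
  }
}
```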
+__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hrcp(__hip_bfloat16_raw{hr.x}), hrcp(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate vector round to nearest int */ -__device__ __hip_bfloat162 h2rint(const __hip_bfloat162 h) { - return __hip_bfloat162{hrint(h.x), hrint(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rint(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hrint(__hip_bfloat16_raw{hr.x}), hrint(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate vector reciprocal square root */ -__device__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h) { - return __hip_bfloat162{hrsqrt(h.x), hrsqrt(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hrsqrt(__hip_bfloat16_raw{hr.x}), hrsqrt(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate sin of bfloat162 */ -__device__ __hip_bfloat162 h2sin(const __hip_bfloat162 h) { - return __hip_bfloat162{hsin(h.x), hsin(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sin(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hsin(__hip_bfloat16_raw{hr.x}), hsin(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate sqrt of bfloat162 */ -__device__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h) { - return __hip_bfloat162{hsqrt(h.x), hsqrt(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hsqrt(__hip_bfloat16_raw{hr.x}), hsqrt(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate truncate of bfloat162 */ -__device__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) { - return __hip_bfloat162{htrunc(h.x), htrunc(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(htrunc(__hip_bfloat16_raw{hr.x}), htrunc(__hip_bfloat16_raw{hr.y})); } #endif diff --git a/hipamd/include/hip/amd_detail/amd_hip_fp16.h b/hipamd/include/hip/amd_detail/amd_hip_fp16.h index 62d88a375..81883afc9 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_fp16.h +++ b/hipamd/include/hip/amd_detail/amd_hip_fp16.h @@ -1800,7 +1800,7 @@ THE SOFTWARE. return tmp.h; } #endif // defined(__cplusplus) -#elif defined(__GNUC__) +#elif defined(__GNUC__) || defined(_MSC_VER) #if !defined(__HIPCC_RTC__) #include "hip_fp16_gcc.h" #endif diff --git a/hipamd/include/hip/amd_detail/amd_hip_fp8.h b/hipamd/include/hip/amd_detail/amd_hip_fp8.h new file mode 100644 index 000000000..e54c70241 --- /dev/null +++ b/hipamd/include/hip/amd_detail/amd_hip_fp8.h @@ -0,0 +1,1391 @@ +/** + * MIT License + * + * Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * \file + * \brief amd_hip_fp8.h header, for AMD fp8 data types + */ + +#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ +#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ + +#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && __HIP_DEVICE_COMPILE__ +#define HIP_FP8_CVT_FAST_PATH 1 +#else +#define HIP_FP8_CVT_FAST_PATH 0 +#endif + +#if !defined(__HIPCC_RTC__) +#include +#include + +#include "host_defines.h" // __hip_internal:: +#include "amd_hip_vector_types.h" // float2 etc +#include "amd_hip_fp16.h" // __half_raw +#include "amd_hip_bf16.h" // bf16 +#include "math_fwd.h" // ocml device functions +#endif // !defined(__HIPCC_RTC__) + +#if defined(__HIPCC_RTC__) +#define __FP8_HOST_DEVICE__ __device__ +#define __FP8_HOST_DEVICE_STATIC__ __FP8_HOST_DEVICE__ static +#else +#define __FP8_HOST_DEVICE__ __host__ __device__ +#define __FP8_HOST_DEVICE_STATIC__ __FP8_HOST_DEVICE__ static inline +#endif // __HIPCC_RTC__ + +#if !defined(__HIPCC_RTC__) +static_assert(CHAR_BIT == 8, "byte size should be of 8 bits"); +#endif +static_assert(sizeof(unsigned char) == 1); +static_assert(sizeof(unsigned short int) == 2); +static_assert(sizeof(unsigned int) == 4); + +/** + * \brief Describes FP8 interpretation + */ +enum __hip_fp8_interpretation_t { + __HIP_E4M3_FNUZ = 0, /**< Standard FP8 */ + __HIP_E5M2_FNUZ = 1, /**< BF8 */ +}; + +/** + * \brief Describes saturation behavior + */ +enum __hip_saturation_t { + __HIP_NOSAT = 0, /**< No saturation */ + __HIP_SATFINITE = 1, /**< Saturate to finite */ +}; + +/** \typedef __hip_fp8_storage_t + * + * \brief type to store single fp8 number + */ +typedef unsigned char __hip_fp8_storage_t; + + +/** \typedef __hip_fp8x2_storage_t + * + * \brief type to store two fp8 numbers + */ +typedef unsigned short int __hip_fp8x2_storage_t; + + +/** \typedef __hip_fp8x4_storage_t + * + * \brief type to store four fp8 numbers + */ +typedef unsigned int __hip_fp8x4_storage_t; + +namespace internal { +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L39 +// This has been modified to add double types conversion as well +template +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t cast_to_f8(T _x, int wm, int we, bool clip = false, + bool stoch = false, + unsigned int rng = 0) { + constexpr bool is_half = __hip_internal::is_same::value; + constexpr bool is_float = 
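For orientation, the two FNUZ interpretations used throughout this file split the 8-bit storage as sign/exponent/mantissa = 1/4/3 (`__HIP_E4M3_FNUZ`) and 1/5/2 (`__HIP_E5M2_FNUZ`); zero is unsigned, infinity is not representable, and the single NaN encoding is `0x80` (see `hip_fp8_fnuz_is_nan` further down). A small sketch of pulling the E4M3 fields apart, not part of the header and with an invented function name:

```cpp
static inline void decode_e4m3_fnuz(__hip_fp8_storage_t v) {
  unsigned sign     = v >> 7;          // 1 sign bit
  unsigned exponent = (v >> 3) & 0xF;  // 4 exponent bits (we = 4)
  unsigned mantissa = v & 0x7;         // 3 mantissa bits (wm = 3)
  bool is_nan = (v == 0x80);           // only NaN encoding in the FNUZ formats
  (void)sign; (void)exponent; (void)mantissa; (void)is_nan;
}
```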
__hip_internal::is_same::value; + constexpr bool is_double = __hip_internal::is_same::value; + static_assert(is_half || is_float || is_double, "Only half, float and double can be cast to f8"); + + const int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10); + unsigned long long x; + + if (sizeof(T) == 8) + x = reinterpret_cast(_x); + else if (sizeof(T) == 4) + x = reinterpret_cast(_x); + else + x = reinterpret_cast(_x); + + + unsigned long long head, mantissa; + int exponent, bias; + unsigned int sign; + + if (sizeof(T) == 8) { + head = x & 0xFFF0000000000000ull; + mantissa = x & 0xFFFFFFFFFFFFFull; + exponent = (head >> 52) & 0x7FF; + sign = head >> 63; + bias = 1023; + } else if (sizeof(T) == 4) { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + } else { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + } + + unsigned int signed_inf = (sign << 7) + (((1 << we) - 1) << wm); + + // Deal with inf and NaNs + if (negative_zero_nan) { + if (sizeof(T) == 8) { + if ((x & 0x7FF0000000000000ull) == 0x7FF0000000000000ull) return 0x80; + } else if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) return 0x80; + } else { + if ((x & 0x7C00) == 0x7C00) return 0x80; + } + } else { + if (sizeof(T) == 8) { + if ((x & 0x7FF0000000000000ull) == 0x7FF0000000000000ull) + return signed_inf + (mantissa != 0 ? 1 : 0); + } else if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) return signed_inf + (mantissa != 0 ? 1 : 0); + } else { + if ((x & 0x7C00) == 0x7C00) return signed_inf + (mantissa != 0 ? 1 : 0); + } + } + + if (x == 0) { + return 0; + } + + // First need to check if it is normal or denorm as there is a difference of implict 1 + // Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift + // The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for + // RNE, no need to add rng. Then probably need to check whether there is carry and adjust + // exponent and mantissa again + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits + const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0); + const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal + // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + // f8_exponent is the converted f8 exponent with bias encoding + // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + // the difference needs to be adjusted and mantissa shifted + int act_exponent, f8_exponent, exponent_diff; + + if (exponent == 0) { // fp32/fp16 is in denormal. + /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16 +here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal has +exponent bias 15 while bf8 with NANOO has exponent bias 16. It means that there are some numbers in +fp16 denormal but they are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers +where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO) normal. 
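As a quick arithmetic check of the bias computed above: with `negative_zero_nan` set, the FNUZ bias is one higher than the IEEE-style `2^(we-1) - 1`, giving 8 for E4M3 (`we` = 4) and 16 for E5M2 (`we` = 5):

```cpp
static_assert((1 << (4 - 1)) - 1 + 1 == 8,  "E4M3 FNUZ exponent bias");
static_assert((1 << (5 - 1)) - 1 + 1 == 16, "E5M2 FNUZ exponent bias");
```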
In +this case, the fp16 mantissa should be shift left by 1 */ + act_exponent = exponent - bias + 1; + exponent_diff = f8_denormal_act_exponent - + act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } else { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if (act_exponent <= f8_denormal_act_exponent) { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal range. +For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16 +actual exponent is -7, it is actually larger due to the implict 1, +Therefore it needs to be adjust to -6 and mantissa shift right by 1. +So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } else { // both fp32/fp16 and f8 are in normal range + exponent_diff = 0; // exponent_diff=0 does not mean there is no difference for this case, + // act_exponent could be larger. Just that it does not need shift mantissa + } + mantissa += (1ull << mfmt); // Add the implicit 1 into mantissa + } + + bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) == + (1ull << (mfmt - wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we shift +right as shift right could rip off some residual part and make something not midpoint look like +midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than midpoint, but +after shift right by 4 bits, it would look like midpoint. +*/ + + if (exponent_diff > 0) + mantissa >>= exponent_diff; + else if (exponent_diff == -1) + mantissa <<= -exponent_diff; + bool implicit_one = mantissa & (1ull << mfmt); + // if there is no implict 1, it means the f8 is denormal and need to adjust to denorm exponent + f8_exponent = + (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1); + + // Now we have the exponent and mantissa adjusted + unsigned long long drop_mask = (1ull << (mfmt - wm)) - 1; + bool odd = + mantissa & (1ull << (mfmt - wm)); // if the least significant bit that is not truncated is 1 + mantissa += + (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1ull) : mantissa)) & drop_mask; + + // Now we deal with overflow + if (f8_exponent == 0) { + if ((1ull << mfmt) & mantissa) { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } else { + if ((1ull << (mfmt + 1)) & mantissa) { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); + if (f8_exponent > max_exp) { + if (clip) { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } else { + return signed_inf; + } + } + + if (f8_exponent == 0 && mantissa == 0) return negative_zero_nan ? 
0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220 +// This has been modified to handle double types as well +template +__FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, int wm, int we) { + constexpr bool is_half = __hip_internal::is_same::value; + constexpr bool is_float = __hip_internal::is_same::value; + constexpr bool is_double = __hip_internal::is_same::value; + static_assert(is_half || is_float || is_double, "only half, float and double are supported"); + + constexpr int weo = is_half ? 5 : (is_float ? 8 : 11); + constexpr int wmo = is_half ? 10 : (is_float ? 23 : 52); + + T fInf, fNegInf, fNaN, fNeg0; + if (is_half) { + const unsigned short int ihInf = 0x7C00; + const unsigned short int ihNegInf = 0xFC00; + const unsigned short int ihNaN = 0x7C01; + const unsigned short int ihNeg0 = 0x8000; + fInf = reinterpret_cast(ihInf); + fNegInf = reinterpret_cast(ihNegInf); + fNaN = reinterpret_cast(ihNaN); + fNeg0 = reinterpret_cast(ihNeg0); + } else if (is_float) { + const unsigned int ifInf = 0x7F800000; + const unsigned int ifNegInf = 0xFF800000; + const unsigned int ifNaN = 0x7F800001; + const unsigned int ifNeg0 = 0x80000000; + fInf = reinterpret_cast(ifInf); + fNegInf = reinterpret_cast(ifNegInf); + fNaN = reinterpret_cast(ifNaN); + fNeg0 = reinterpret_cast(ifNeg0); + } else if (is_double) { + const unsigned long long ifInf = 0x7FF0000000000000ull; + const unsigned long long ifNegInf = 0xFFF0000000000000ull; + const unsigned long long ifNaN = 0x7FF0000000000001ull; + const unsigned long long ifNeg0 = 0x8000000000000000ull; + fInf = reinterpret_cast(ifInf); + fNegInf = reinterpret_cast(ifNegInf); + fNaN = reinterpret_cast(ifNaN); + fNeg0 = reinterpret_cast(ifNeg0); + } + + if (x == 0) { + return 0; + } + + unsigned long long sign = x >> 7; + unsigned long long mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if (negative_zero_nan) { + if (x == 0x80) return fNaN; + } else { + if (x == 0x80) return fNeg0; + if (exponent == ((1 << we) - 1)) return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + + typename __hip_internal::conditional< + sizeof(T) == 2, unsigned short int, + typename __hip_internal::conditional::type>::type retval; + + if (we == 5 && is_half && !negative_zero_nan) { + retval = x << 8; + return reinterpret_cast(retval); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0); + + // subnormal input + if (exponent == 0) { +#if __HIP_DEVICE_COMPILE__ + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + __clz(mantissa) - (32 - wm); +#else + int sh = 1 + __builtin_clz(mantissa) - (32 - wm); +#endif + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1ull << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if (exponent <= 0) { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if (sizeof(T) == 2) + retval = (sign << 15) | (exponent << 10) | mantissa; + else if (sizeof(T) == 4) + retval = (sign << 31) | (exponent << 23) | mantissa; + else + retval = (sign << 63) | (static_cast(exponent) << 52) | mantissa; + return reinterpret_cast(retval); +} + +#if HIP_FP8_CVT_FAST_PATH +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79 +template +static __device__ __hip_fp8_storage_t cast_to_f8_from_f32(float v, bool saturate, + __hip_fp8_interpretation_t interpret, + unsigned int rng = 0) { + __hip_fp8_storage_t i8data; + union { + float fval; + unsigned int i32val; + unsigned char i8val[4]; // NOTE: not endian independent + } val; + + unsigned int ival = 0; + val.fval = v; + + if (saturate) { + if (interpret == __HIP_E4M3_FNUZ) { + if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + } else { + if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0); + } + } + } + + if (stochastic_rounding) { + ival = interpret == __HIP_E4M3_FNUZ + ? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0) + : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos + val.i32val = ival; + i8data = val.i8val[0]; // little endian + } else { // RNE CVT + ival = interpret == __HIP_E4M3_FNUZ + ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false) + : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0 + val.i32val = ival; + i8data = val.i8val[0]; + } + return i8data; +} + +static __device__ __hip_fp8x2_storage_t +cast_to_f8x2_from_f32x2(float2 v, bool saturate, __hip_fp8_interpretation_t interpret) { + union { + static_assert(sizeof(float2) == sizeof(unsigned int[2])); + static_assert(sizeof(float2) == sizeof(unsigned short[4])); + float2 fval; + unsigned int i32val[2]; + unsigned short i16val[4]; + } f2val; + + f2val.fval = v; + + if (saturate) { /// propagate NAN/INF, no clipping + if ((f2val.i32val[0] & 0x7F800000) != 0x7F800000) { + f2val.fval.x = __builtin_amdgcn_fmed3f(f2val.fval.x, 240.0, -240.0); + } + if ((f2val.i32val[1] & 0x7F800000) != 0x7F800000) { + f2val.fval.y = __builtin_amdgcn_fmed3f(f2val.fval.x, 240.0, -240.0); + } + } + + f2val.i32val[0] = interpret == __HIP_E4M3_FNUZ + ? __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, 0, false) + : __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, 0, false); + + return static_cast<__hip_fp8x2_storage_t>(f2val.i16val[0]); +} + +static __device__ float cast_to_f32_from_f8(__hip_fp8_storage_t v, + __hip_fp8_interpretation_t interpret) { + union { + unsigned int i32val; + unsigned char i8val[4]; + } val; + val.i8val[0] = v; + + float fval = interpret == __HIP_E4M3_FNUZ ? 
__builtin_amdgcn_cvt_f32_fp8(val.i32val, 0) + : __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0); + return fval; +} + +static __device__ float2 cast_to_f32x2_from_f8x2(__hip_fp8x2_storage_t v, + __hip_fp8_interpretation_t interpret) { + union { + unsigned int i32val; + unsigned short i16val[2]; + } val; + val.i16val[0] = v; + + auto f2 = interpret == __HIP_E4M3_FNUZ ? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false) + : __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false); + return float2{f2[0], f2[1]}; +} +#endif // HIP_FP8_CVT_FAST_PATH + +/* For fp8 fnuz types, finite and NaN values are supported. Zero is unsigned. +Inf are not supported. This gives us one additional number to represent. +NaN are represented by 1-0000-000 or 1-00000-00 */ +__FP8_HOST_DEVICE_STATIC__ bool hip_fp8_fnuz_is_nan(__hip_fp8_storage_t a) { + return static_cast(a) == 0x80; +} +} // namespace internal + +/** + * \brief convert float to @p __hip_fp8_storage_t + * + * \param f float number + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t __hip_cvt_float_to_fp8( + const float f, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f8_from_f32(f, sat == __HIP_SATFINITE, type); +#else // HIP_FP8_CVT_FAST_PATH + int we = type == __HIP_E4M3_FNUZ ? 4 : 5; + int wm = type == __HIP_E4M3_FNUZ ? 3 : 2; + return internal::cast_to_f8(f, wm, we, sat == __HIP_SATFINITE); +#endif // HIP_FP8_CVT_FAST_PATH +} + +/** + * \brief convert float2 to @p __hip_fp8x2_storage_t + * + * \param f2 float2 number + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8x2_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t __hip_cvt_float2_to_fp8x2( + const float2 f2, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f8x2_from_f32x2(f2, sat == __HIP_SATFINITE, type); +#else + return static_cast<__hip_fp8x2_storage_t>( + static_cast(__hip_cvt_float_to_fp8(f2.y, sat, type)) << 8 | + static_cast(__hip_cvt_float_to_fp8(f2.x, sat, type))); +#endif +} + +/** + * \brief convert double to @p __hip_fp8_storage_t + * + * \param d double val + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t __hip_cvt_double_to_fp8( + const double d, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { + int we = type == __HIP_E4M3_FNUZ ? 4 : 5; + int wm = type == __HIP_E4M3_FNUZ ? 
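On the gfx940/941/942 fast path, saturation happens up front: the input is clamped with `fmed3` to the largest finite magnitude of the target format before the hardware convert builtin runs, which mirrors what the generic path does through the `clip` argument of `cast_to_f8`. The bounds baked into the clamps above are (constant names are mine, the values come from the code):

```cpp
constexpr float kMaxE4M3Fnuz = 240.0f;    // largest finite E4M3 FNUZ value
constexpr float kMaxE5M2Fnuz = 57344.0f;  // largest finite E5M2 FNUZ value
```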
3 : 2; + return internal::cast_to_f8(d, wm, we, sat == __HIP_SATFINITE); +} + +/** + * \brief convert double2 to @p __hip_fp8x2_storage_t + * + * \param d2 double2 val + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8x2_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t __hip_cvt_double2_to_fp8x2( + const double2 d2, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { + return static_cast<__hip_fp8x2_storage_t>( + static_cast(__hip_cvt_double_to_fp8(d2.y, sat, type)) << 8 | + static_cast(__hip_cvt_double_to_fp8(d2.x, sat, type))); +} + +/** + * \brief convert __hip_bfloat16_raw to @p __hip_fp8_storage_t + * + * \param hr __hip_bfloat16_raw val + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t +__hip_cvt_bfloat16raw_to_fp8(const __hip_bfloat16_raw hr, const __hip_saturation_t sat, + const __hip_fp8_interpretation_t type) { + float fval = __hip_bfloat16(hr); + return __hip_cvt_float_to_fp8(fval, sat, type); +} + +/** + * \brief convert double2 to @p __hip_fp8x2_storage_t + * + * \param hr __hip_bfloat162_raw value + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8x2_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t +__hip_cvt_bfloat16raw2_to_fp8x2(const __hip_bfloat162_raw hr, const __hip_saturation_t sat, + const __hip_fp8_interpretation_t type) { + float2 f2 = __hip_bfloat162(hr); + return __hip_cvt_float2_to_fp8x2(f2, sat, type); +} + +/** + * \brief convert @p __hip_fp8_storage_t to __half_raw + * + * \param x __hip_fp8_storage_t val + * \param type interpretation of fp8 + * \return __half_raw + */ +__FP8_HOST_DEVICE_STATIC__ __half_raw +__hip_cvt_fp8_to_halfraw(const __hip_fp8_storage_t x, const __hip_fp8_interpretation_t type) { + unsigned int we = type == __HIP_E4M3_FNUZ ? 4 : 5; + unsigned int wm = type == __HIP_E4M3_FNUZ ? 
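The `__hip_cvt_*` functions form the raw-storage conversion layer: floating-point values go to packed fp8 bytes under an explicit saturation mode and interpretation, and come back out through `__half_raw`. A round-trip sketch; the function name and chosen value are illustrative:

```cpp
__host__ __device__ inline float fp8_roundtrip(float x) {
  // float -> E4M3 FNUZ byte, saturating out-of-range inputs to the finite maximum
  __hip_fp8_storage_t byte = __hip_cvt_float_to_fp8(x, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
  // byte -> half -> float; expect rounding error relative to the original value
  __half h = __half(__hip_cvt_fp8_to_halfraw(byte, __HIP_E4M3_FNUZ));
  return __half2float(h);
}
```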
3 : 2; + return __half_raw{internal::cast_from_f8<_Float16, true>(x, wm, we)}; +} + +/** + * \brief convert @p __hip_fp8x2_storage_t to __half2_raw + * + * \param x __hip_fp8x2_storage_t val + * \param type interpretation of fp8 + * \return __half2_raw + */ +__FP8_HOST_DEVICE_STATIC__ __half2_raw +__hip_cvt_fp8x2_to_halfraw2(const __hip_fp8x2_storage_t x, const __hip_fp8_interpretation_t type) { + __half2 ret(static_cast<__half>( + __hip_cvt_fp8_to_halfraw(static_cast<__hip_fp8_storage_t>(x & 0xFF), type)), + static_cast<__half>( + __hip_cvt_fp8_to_halfraw(static_cast<__hip_fp8_storage_t>(x >> 8), type))); + return static_cast<__half2_raw>(ret); +} + +/** + * \brief convert __half_raw to @p __hip_fp8_storage_t + * + * \param x __half_raw value + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t __hip_cvt_halfraw_to_fp8( + const __half_raw x, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { + return __hip_cvt_float_to_fp8(__half2float(__half(x)), sat, type); +} + +/** + * \brief convert __half2_raw to @p __hip_fp8x2_storage_t + * + * \param x __half2_raw value + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8x2_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t __hip_cvt_halfraw2_to_fp8x2( + const __half2_raw x, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { + return __hip_cvt_float2_to_fp8x2(__half22float2(__half2(x)), sat, type); +} + +/** + * \brief struct representing single fp8 number with e4m3 interpretation + * + */ +struct __hip_fp8_e4m3_fnuz { + __hip_fp8_storage_t __x; //! raw storage of fp8 number + constexpr static __hip_saturation_t __default_saturation = __HIP_SATFINITE; + constexpr static __hip_fp8_interpretation_t __default_interpret = __HIP_E4M3_FNUZ; + constexpr static unsigned int __we = 4; + constexpr static unsigned int __wm = 3; + + // TODO: SWDEV-452411 + // Add cast from unsigned long long, long long to fp8 + + /*! create fp8 e4m3 from long */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from int */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from short int */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from unsigned long */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const unsigned long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from unsigned int */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const unsigned int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from unsigned short */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const unsigned short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from double */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const double f) + : __x(__hip_cvt_double_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! 
create fp8 e4m3 from float */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const float f) + : __x(__hip_cvt_float_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! create fp8 e4m3 from __hip_bfloat16 */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const __hip_bfloat16 f) + : __x(__hip_cvt_float_to_fp8(static_cast(f), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from __half */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const __half f) + : __x(__hip_cvt_halfraw_to_fp8(static_cast<__half_raw>(f), __default_saturation, + __default_interpret)) {} + + /*! default construct fp8 e4m3 */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz() = default; + + /*! convert fp8 e4m3 to __half */ + __FP8_HOST_DEVICE__ operator __half() const { + return __half(__hip_cvt_fp8_to_halfraw(__x, __default_interpret)); + } + + /*! convert fp8 e4m3 to __hip_bfloat16 */ + __FP8_HOST_DEVICE__ operator __hip_bfloat16() const { + float f = *this; + return __hip_bfloat16(f); + } + + /*! convert fp8 e4m3 to bool, return false if value is 0, true otherwise */ + __FP8_HOST_DEVICE__ operator bool() const { + // it can be 0x00 (+0.0) since 0x80 will be nan + return !(static_cast(__x) == 0); + } + + /*! convert fp8 e4m3 to char, clamp number to CHAR_MIN/CHAR_MAX if its out of range */ + __FP8_HOST_DEVICE__ operator char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + auto fval = internal::cast_from_f8(__x, __wm, __we); + auto llval = static_cast(fval); + if (llval <= CHAR_MIN) { + return CHAR_MIN; + } else if (llval >= CHAR_MAX) { + return CHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to double */ + __FP8_HOST_DEVICE__ operator double() const { + return internal::cast_from_f8(__x, __wm, __we); + } + + /*! convert fp8 e4m3 to float */ + __FP8_HOST_DEVICE__ operator float() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32_from_f8(__x, __default_interpret); +#else + return internal::cast_from_f8(__x, __wm, __we); +#endif + } + + /*! convert fp8 e4m3 to int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e4m3 to long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e4m3 to long long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e4m3 to short int, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SHRT_MIN) { + return SHRT_MIN; + } else if (llval >= SHRT_MAX) { + return SHRT_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to signed char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator signed char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SCHAR_MIN) { + return SCHAR_MIN; + } else if (llval >= SCHAR_MAX) { + return SCHAR_MAX; + } + return static_cast(fval); + } + + /*! 
convert fp8 e4m3 to unsigned char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } else if (llval >= UCHAR_MAX) { + return UCHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to unsigned int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to unsigned long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to long long int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to unsigned short, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } +}; + +/** + * \brief struct representing two fp8 numbers with e4m3 interpretation + * + */ +struct __hip_fp8x2_e4m3_fnuz { + __hip_fp8x2_storage_t __x; //! raw storage of two fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E4M3_FNUZ; + static constexpr unsigned int __we = 4; + static constexpr unsigned int __wm = 3; + + /*! create fp8x2 e4m3 type from double2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const double2 val) + : __x(__hip_cvt_double2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e4m3 type from float2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const float2 val) + : __x(__hip_cvt_float2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e4m3 type from __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const __hip_bfloat162 val) + : __x(__hip_cvt_bfloat16raw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e4m3 type from __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const __half2 val) + : __x(__hip_cvt_halfraw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! Default construct of fp8x2 e4m3 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz() = default; + + /*! convert fp8x2 e4m3 to __half2 */ + __FP8_HOST_DEVICE__ operator __half2() const { + return __half2(__hip_cvt_fp8x2_to_halfraw2(__x, __default_interpret)); + } + + /*! 
convert fp8x2 e4m3 to float2 */ + __FP8_HOST_DEVICE__ operator float2() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret); +#else + return float2(internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x & 0xFF), + __wm, __we), + internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x >> 8), + __wm, __we)); +#endif + } +}; + +/** + * \brief struct representing four fp8 numbers with e4m3 interpretation + * + */ +struct __hip_fp8x4_e4m3_fnuz { + __hip_fp8x4_storage_t __x; //! raw storage of four fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E4M3_FNUZ; + static constexpr unsigned int __we = 4; + static constexpr unsigned int __wm = 3; + + /*! create fp8x4 e4m3 type from double4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const double4 val) + : __x{reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_double_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))} {} + + /*! create fp8x4 e4m3 type from float4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const float4 val) + : __x{reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_float_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))} {} + + /*! create fp8x4 e4m3 type from two __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const __hip_bfloat162 low, const __hip_bfloat162 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>(static_cast( + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(high, __default_saturation, __default_interpret)) | + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(low, __default_saturation, __default_interpret)) + << 16))) {} + + /*! create fp8x4 e4m3 type from two __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const __half2 low, const __half2 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + high, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + low, __default_saturation, __default_interpret)) + << 16))) {} + + /*! Default construct fp8x4 e4m3 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz() = default; + + /*! 
convert fp8x4 e4m3 to float4 */ + __FP8_HOST_DEVICE__ operator float4() const { + auto x = __x; // bypass const + auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E + auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1); +#if HIP_FP8_CVT_FAST_PATH + float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret); + float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret); +#else + float2 high = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_high << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_high >> 8), __wm, __we)); + float2 low = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_low << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm, __we)); +#endif + return float4(low.x, low.y, high.x, high.y); + } +}; + +/** + * \brief struct representing one fp8 number with e5m2 interpretation + * + */ +struct __hip_fp8_e5m2_fnuz { + __hip_fp8_storage_t __x; //! raw storage of one fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E5M2_FNUZ; + static constexpr unsigned int __we = 5; + static constexpr unsigned int __wm = 2; + + + // TODO: SWDEV-452411 + // Add cast from unsigned long long, long long to fp8 + + /*! create fp8 e5m2 type from long */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from int */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from short int */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from unsigned long */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const unsigned long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from unsigned int */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const unsigned int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from unsigned short */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const unsigned short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from double */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const double f) + : __x(__hip_cvt_double_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! create fp8 e5m2 type from float */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const float f) + : __x(__hip_cvt_float_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! create fp8 e5m2 type from __hip_bfloat16 */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const __hip_bfloat16 f) + : __x(__hip_cvt_float_to_fp8(static_cast(f), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from __hip_bfloat16 */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const __half f) + : __x(__hip_cvt_halfraw_to_fp8(static_cast<__half_raw>(f), __default_saturation, + __default_interpret)) {} + + /*! 
default construct fp8 e5m2 */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz() = default; + + /*! convert fp8 e5m2 to float */ + __FP8_HOST_DEVICE__ operator float() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32_from_f8(__x, __default_interpret); +#else + return internal::cast_from_f8(__x, __wm, __we); +#endif + } + + /*! convert fp8 e5m2 to __half */ + __FP8_HOST_DEVICE__ operator __half() const { + return __half(__hip_cvt_fp8_to_halfraw(__x, __default_interpret)); + } + + /*! convert fp8 e5m2 to __hip_bfloat16 */ + __FP8_HOST_DEVICE__ operator __hip_bfloat16() const { + float f = *this; + return __hip_bfloat16(f); + } + + /*! convert fp8 e4m3 to bool, return false if value is 0, true otherwise */ + __FP8_HOST_DEVICE__ operator bool() const { + // it can be 0x00 (+0.0) since 0x80 will be nan + return !(static_cast(__x) == 0); + } + + /*! convert fp8 e5m2 to char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= CHAR_MIN) { + return CHAR_MIN; + } else if (llval >= CHAR_MAX) { + return CHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to double */ + __FP8_HOST_DEVICE__ operator double() const { + return internal::cast_from_f8(__x, __wm, __we); + } + + /*! convert fp8 e5m2 to int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e5m2 to long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e5m2 to long long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e5m2 to short, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SHRT_MIN) { + return SHRT_MIN; + } else if (llval >= SHRT_MAX) { + return SHRT_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to signed char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator signed char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SCHAR_MIN) { + return SCHAR_MIN; + } else if (llval >= SCHAR_MAX) { + return SCHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } else if (llval >= UCHAR_MAX) { + return UCHAR_MAX; + } + return static_cast(fval); + } + + /*! 
convert fp8 e5m2 to unsigned int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned long long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned short, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } +}; + +/** + * \brief struct representing two fp8 numbers with e5m2 interpretation + * + */ +struct __hip_fp8x2_e5m2_fnuz { + __hip_fp8x2_storage_t __x; //! raw storage of two fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E5M2_FNUZ; + static constexpr unsigned int __we = 5; + static constexpr unsigned int __wm = 2; + + /*! create fp8x2 e5m2 type from double2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const double2 val) + : __x(__hip_cvt_double2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e5m2 type from float2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const float2 val) + : __x(__hip_cvt_float2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e5m2 type from __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const __hip_bfloat162 val) + : __x(__hip_cvt_bfloat16raw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e5m2 type from __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const __half2 val) + : __x(__hip_cvt_halfraw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! default construct fp8x2 e5m2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz() = default; + + /*! convert fp8x2 e5m2 to __half2 */ + __FP8_HOST_DEVICE__ operator __half2() const { + return __half2(__hip_cvt_fp8x2_to_halfraw2(__x, __default_interpret)); + } + + /*! convert fp8x2 e5m2 to float2 */ + __FP8_HOST_DEVICE__ operator float2() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret); +#else + return float2(internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x & 0xFF), + __wm, __we), + internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x >> 8), + __wm, __we)); +#endif + } +}; + +/** + * \brief struct representing four fp8 numbers with e5m2 interpretation + * + */ +struct __hip_fp8x4_e5m2_fnuz { + __hip_fp8x4_storage_t __x; //! 
raw storage of four fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E5M2_FNUZ; + static constexpr unsigned int __we = 5; + static constexpr unsigned int __wm = 2; + + /*! create fp8x4 e5m2 type from double4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const double4 val) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_double_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))) {} + + /*! create fp8x4 e5m2 type from float4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const float4 val) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_float_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))) {} + + /*! create fp8x4 e5m2 type from two __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const __hip_bfloat162 low, const __hip_bfloat162 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>(static_cast( + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(high, __default_saturation, __default_interpret)) | + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(low, __default_saturation, __default_interpret)) + << 16))) {} + + /*! create fp8x4 e5m2 type from two __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const __half2 low, const __half2 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + high, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + low, __default_saturation, __default_interpret)) + << 16))) {} + + /* default construct fp8x4 e5m2 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz() = default; + + /*! 
convert fp8x4 e5m2 to float4 */ + __FP8_HOST_DEVICE__ operator float4() const { + auto x = __x; // bypass const + auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E + auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1); +#if HIP_FP8_CVT_FAST_PATH + float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret); + float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret); +#else + float2 high = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_high << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_high >> 8), __wm, __we)); + float2 low = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_low << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm, __we)); +#endif + return float4(low.x, low.y, high.x, high.y); + } +}; + +#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 9d26d6f36..8ef0b2e1d 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -56,6 +56,12 @@ T __hip_readfirstlane(T val) { return u.d; } +// When compiling for wave32 mode, ignore the upper half of the 64-bit mask. +#define __hip_adjust_mask_for_wave32(MASK) \ + do { \ + if (warpSize == 32) MASK &= 0xFFFFFFFF; \ + } while (0) + // We use a macro to expand each builtin into a waterfall that implements the // mask semantics: // @@ -125,6 +131,7 @@ unsigned long long __ballot_sync(MaskT mask, int predicate) { __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); __hip_check_mask(mask); return __ballot(predicate) & mask; } @@ -136,6 +143,7 @@ int __all_sync(MaskT mask, int predicate) { __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); return __ballot_sync(mask, predicate) == mask; } @@ -146,6 +154,7 @@ int __any_sync(MaskT mask, int predicate) { __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); return __ballot_sync(mask, predicate) != 0; } @@ -182,6 +191,7 @@ unsigned long long __match_any_sync(MaskT mask, T value) { __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); __hip_check_mask(mask); return __match_any(value) & mask; } @@ -212,6 +222,7 @@ unsigned long long __match_all_sync(MaskT mask, T value, int* pred) { "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); MaskT retval = 0; + __hip_adjust_mask_for_wave32(mask); __hip_do_sync(retval, __match_all, mask, value, pred); return retval; } @@ -226,6 +237,7 @@ T __shfl_sync(MaskT mask, T var, int srcLane, __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. 
" "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); __hip_check_mask(mask); return __shfl(var, srcLane, width); } @@ -238,6 +250,7 @@ T __shfl_up_sync(MaskT mask, T var, unsigned int delta, __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); __hip_check_mask(mask); return __shfl_up(var, delta, width); } @@ -250,6 +263,7 @@ T __shfl_down_sync(MaskT mask, T var, unsigned int delta, __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); __hip_check_mask(mask); return __shfl_down(var, delta, width); } @@ -262,11 +276,13 @@ T __shfl_xor_sync(MaskT mask, T var, int laneMask, __hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); __hip_check_mask(mask); return __shfl_xor(var, laneMask, width); } #undef __hip_do_sync #undef __hip_check_mask +#undef __hip_adjust_mask_for_wave32 #endif // HIP_ENABLE_WARP_SYNC_BUILTINS diff --git a/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/hipamd/include/hip/amd_detail/hip_api_trace.hpp index 96031d3b4..957ea7562 100644 --- a/hipamd/include/hip/amd_detail/hip_api_trace.hpp +++ b/hipamd/include/hip/amd_detail/hip_api_trace.hpp @@ -954,6 +954,18 @@ typedef hipError_t (*t_hipStreamBeginCaptureToGraph)(hipStream_t stream, hipGrap hipStreamCaptureMode mode); typedef hipError_t (*t_hipGetFuncBySymbol)(hipFunction_t* functionPtr, const void* symbolPtr); +typedef hipError_t (*t_hipDrvGraphAddMemFreeNode)(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, + const hipGraphNode_t* dependencies, size_t numDependencies, + hipDeviceptr_t dptr); + +typedef hipError_t (*t_hipDrvGraphExecMemcpyNodeSetParams)(hipGraphExec_t hGraphExec, + hipGraphNode_t hNode, const HIP_MEMCPY3D* copyParams, + hipCtx_t ctx); + +typedef hipError_t (*t_hipDrvGraphExecMemsetNodeSetParams)(hipGraphExec_t hGraphExec, + hipGraphNode_t hNode, const HIP_MEMSET_NODE_PARAMS* memsetParams, + hipCtx_t ctx); + // HIP Compiler dispatch table struct HipCompilerDispatchTable { size_t size; @@ -1420,4 +1432,7 @@ struct HipDispatchTable { t_hipGetProcAddress hipGetProcAddress_fn; t_hipStreamBeginCaptureToGraph hipStreamBeginCaptureToGraph_fn; t_hipGetFuncBySymbol hipGetFuncBySymbol_fn; + t_hipDrvGraphAddMemFreeNode hipDrvGraphAddMemFreeNode_fn; + t_hipDrvGraphExecMemcpyNodeSetParams hipDrvGraphExecMemcpyNodeSetParams_fn; + t_hipDrvGraphExecMemsetNodeSetParams hipDrvGraphExecMemsetNodeSetParams_fn; }; diff --git a/hipamd/include/hip/amd_detail/host_defines.h b/hipamd/include/hip/amd_detail/host_defines.h index 0fad2b470..e7e836496 100644 --- a/hipamd/include/hip/amd_detail/host_defines.h +++ b/hipamd/include/hip/amd_detail/host_defines.h @@ -127,6 +127,10 @@ template struct is_trivial : public integral_constant { }; + + +template struct conditional { using type = T; }; +template struct conditional { using type = F; }; } typedef __hip_internal::uint8_t __hip_uint8_t; typedef __hip_internal::uint16_t __hip_uint16_t; diff --git a/hipamd/packaging/CMakeLists.txt b/hipamd/packaging/CMakeLists.txt index 34d6a9633..8f1a1268f 100644 --- a/hipamd/packaging/CMakeLists.txt +++ b/hipamd/packaging/CMakeLists.txt @@ -74,19 +74,25 
@@ endif()#End HIP_PLATFORM = "amd" #End bianry files install #Begin dev files install -if(WIN32) - install(DIRECTORY ${HIP_COMMON_DIR}/bin DESTINATION . COMPONENT dev - USE_SOURCE_PERMISSIONS) -else() - install(DIRECTORY ${HIP_COMMON_DIR}/bin DESTINATION . COMPONENT dev - USE_SOURCE_PERMISSIONS - DIRECTORY_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE - PATTERN *.bat EXCLUDE) +#Install bin files from HIP_COMMON_DIR +file(GLOB BIN_FILES ${HIP_COMMON_DIR}/bin/*) +if(NOT WIN32) + list(FILTER BIN_FILES EXCLUDE REGEX ".bat$") +endif() +foreach(binFile ${BIN_FILES}) + install(PROGRAMS ${binFile} DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT dev) +endforeach() +unset(BIN_FILES) + +#Install bin files from hip_SOURCE_DIR +file(GLOB BIN_FILES ${hip_SOURCE_DIR}/bin/*) +if(NOT WIN32) + list(FILTER BIN_FILES EXCLUDE REGEX ".bat$") endif() +foreach(binFile ${BIN_FILES}) + install(PROGRAMS ${binFile} DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT dev) +endforeach() -install(DIRECTORY ${hip_SOURCE_DIR}/bin DESTINATION . COMPONENT dev - USE_SOURCE_PERMISSIONS - DIRECTORY_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) install(DIRECTORY ${HIP_COMMON_DIR}/include DESTINATION . COMPONENT dev) install(DIRECTORY ${hip_SOURCE_DIR}/include/hip/amd_detail DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip COMPONENT dev) diff --git a/hipamd/src/amdhip.def b/hipamd/src/amdhip.def index cfc3ae6b5..c1017af7a 100644 --- a/hipamd/src/amdhip.def +++ b/hipamd/src/amdhip.def @@ -463,3 +463,6 @@ hipGraphAddNode hipGraphInstantiateWithParams hipStreamBeginCaptureToGraph hipGetFuncBySymbol +hipDrvGraphAddMemFreeNode +hipDrvGraphExecMemcpyNodeSetParams +hipDrvGraphExecMemsetNodeSetParams diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp index 1479ae150..05598dd7f 100644 --- a/hipamd/src/hip_api_trace.cpp +++ b/hipamd/src/hip_api_trace.cpp @@ -768,6 +768,13 @@ hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph, const hipGraphEdgeData* dependencyData, size_t numDependencies, hipStreamCaptureMode mode); hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr); +hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, + const hipGraphNode_t* dependencies, size_t numDependencies, + hipDeviceptr_t dptr); +hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, + const HIP_MEMCPY3D* copyParams, hipCtx_t ctx); +hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, + const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx); } // namespace hip namespace hip { @@ -1244,6 +1251,9 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipGetProcAddress_fn = hip::hipGetProcAddress; ptrDispatchTable->hipStreamBeginCaptureToGraph_fn = hip::hipStreamBeginCaptureToGraph; ptrDispatchTable->hipGetFuncBySymbol_fn = hip::hipGetFuncBySymbol; + ptrDispatchTable->hipDrvGraphAddMemFreeNode_fn = hip::hipDrvGraphAddMemFreeNode; + ptrDispatchTable->hipDrvGraphExecMemcpyNodeSetParams_fn = hip::hipDrvGraphExecMemcpyNodeSetParams; + ptrDispatchTable->hipDrvGraphExecMemsetNodeSetParams_fn = hip::hipDrvGraphExecMemsetNodeSetParams; } #if HIP_ROCPROFILER_REGISTER > 0 @@ -1806,7 +1816,9 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipTexRefGetArray_fn, 441) HIP_ENFORCE_ABI(HipDispatchTable, hipGetProcAddress_fn, 442) HIP_ENFORCE_ABI(HipDispatchTable, 
hipStreamBeginCaptureToGraph_fn, 443); HIP_ENFORCE_ABI(HipDispatchTable, hipGetFuncBySymbol_fn, 444); - +HIP_ENFORCE_ABI(HipDispatchTable, hipDrvGraphAddMemFreeNode_fn, 445) +HIP_ENFORCE_ABI(HipDispatchTable, hipDrvGraphExecMemcpyNodeSetParams_fn, 446) +HIP_ENFORCE_ABI(HipDispatchTable, hipDrvGraphExecMemsetNodeSetParams_fn, 447) // if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below // will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.: @@ -1814,7 +1826,7 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipGetFuncBySymbol_fn, 444); // HIP_ENFORCE_ABI(<table>, <functionPtr>, 8) // // HIP_ENFORCE_ABI_VERSIONING(<table>
, 9) <- 8 + 1 = 9 -HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 445) +HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 448) static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 3, "If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function " diff --git a/hipamd/src/hip_code_object.cpp b/hipamd/src/hip_code_object.cpp index 6b3a6d08f..477e9a811 100644 --- a/hipamd/src/hip_code_object.cpp +++ b/hipamd/src/hip_code_object.cpp @@ -30,21 +30,34 @@ THE SOFTWARE. #include "hip_internal.hpp" #include "platform/program.hpp" #include +#include "comgrctx.hpp" + namespace hip { hipError_t ihipFree(void* ptr); // forward declaration of methods required for managed variables hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); namespace { -constexpr char kOffloadBundleMagicStr[] = "__CLANG_OFFLOAD_BUNDLE__"; +// In uncompressed mode +constexpr char kOffloadBundleUncompressedMagicStr[] = "__CLANG_OFFLOAD_BUNDLE__"; +static constexpr size_t kOffloadBundleUncompressedMagicStrSize = + sizeof(kOffloadBundleUncompressedMagicStr); + +//In compressed mode +constexpr char kOffloadBundleCompressedMagicStr[] = "CCOB"; +static constexpr size_t kOffloadBundleCompressedMagicStrSize = + sizeof(kOffloadBundleCompressedMagicStr); + constexpr char kOffloadKindHip[] = "hip"; constexpr char kOffloadKindHipv4[] = "hipv4"; constexpr char kOffloadKindHcc[] = "hcc"; constexpr char kAmdgcnTargetTriple[] = "amdgcn-amd-amdhsa-"; - +constexpr char kHipFatBinName[] = "hipfatbin"; +constexpr char kHipFatBinName_[] = "hipfatbin-"; +constexpr char kOffloadKindHipv4_[] = "hipv4-"; // bundled code objects need the prefix +constexpr char kOffloadHipV4FatBinName_[] = "hipfatbin-hipv4-"; // ClangOFFLOADBundle info. -static constexpr size_t kOffloadBundleMagicStrSize = sizeof(kOffloadBundleMagicStr); -// Clang Offload bundler description & Header. +// Clang Offload bundler description & Header in uncompressed mode. struct __ClangOffloadBundleInfo { uint64_t offset; uint64_t size; @@ -52,16 +65,37 @@ struct __ClangOffloadBundleInfo { const char bundleEntryId[1]; }; -struct __ClangOffloadBundleHeader { - const char magic[kOffloadBundleMagicStrSize - 1]; +struct __ClangOffloadBundleUncompressedHeader { + const char magic[kOffloadBundleUncompressedMagicStrSize - 1]; uint64_t numOfCodeObjects; __ClangOffloadBundleInfo desc[1]; }; + +struct __ClangOffloadBundleCompressedHeader { + const char magic[kOffloadBundleCompressedMagicStrSize - 1]; + uint16_t versionNumber; + uint16_t compressionMethod; + uint32_t totalSize; + uint32_t uncompressedBinarySize; + uint64_t Hash; + const char compressedBinarydesc[1]; +}; } // namespace -bool CodeObject::IsClangOffloadMagicBundle(const void* data) { - std::string magic(reinterpret_cast(data), kOffloadBundleMagicStrSize - 1); - return magic.compare(kOffloadBundleMagicStr) ? 
false : true; +bool CodeObject::IsClangOffloadMagicBundle(const void* data, bool &isCompressed) { + std::string magic(reinterpret_cast(data), + kOffloadBundleUncompressedMagicStrSize - 1); + if (!magic.compare(kOffloadBundleUncompressedMagicStr)) { + isCompressed = false; + return true; + } + std::string magic1(reinterpret_cast(data), + kOffloadBundleCompressedMagicStrSize - 1); + if (!magic1.compare(kOffloadBundleCompressedMagicStr)) { + isCompressed = true; + return true; + } + return false; } uint64_t CodeObject::ElfSize(const void* emi) { return amd::Elf::getElfSize(emi); } @@ -356,7 +390,7 @@ static bool consume(std::string& input, std::string consume_) { // Trim String till character, will be used to get gpuname // example: input is gfx908:sram-ecc+ and trim char is : -// input will become sram-ecc+. +// input will become :sram-ecc+ static std::string trimName(std::string& input, char trim) { auto pos_ = input.find(trim); auto res = input; @@ -369,6 +403,18 @@ static std::string trimName(std::string& input, char trim) { return res; } +// Trim String till character, will be used to get bundle entry ID. +// example: input is amdgcn-amd-amdhsa--gfx1035.bc and trim char is . +// input will become amdgcn-amd-amdhsa--gfx1035 +static bool trimNameTail(std::string& input, char trim) { + auto pos_ = input.rfind(trim); + if (pos_ == std::string::npos) { + return false; + } + input = input.substr(0, pos_); + return true; +} + static char getFeatureValue(std::string& input, std::string feature) { char res = ' '; if (consume(input, std::move(feature))) { @@ -447,111 +493,353 @@ static bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id, return true; } -// This will be moved to COMGR eventually -hipError_t CodeObject::ExtractCodeObjectFromFile( - amd::Os::FileDesc fdesc, size_t fsize, const void** image, - const std::vector& device_names, - std::vector>& code_objs) { - if (!amd::Os::isValidFileDesc(fdesc)) { - return hipErrorFileNotFound; +size_t CodeObject::getFatbinSize(const void* data, const bool isCompressed) { + if (isCompressed) { + const auto obheader = reinterpret_cast(data); + return obheader->totalSize; + } else { + const auto obheader = reinterpret_cast(data); + const __ClangOffloadBundleInfo* desc = &obheader->desc[0]; + uint64_t i = 0; + while (++i < obheader->numOfCodeObjects) { + desc = reinterpret_cast( + reinterpret_cast(&desc->bundleEntryId[0]) + desc->bundleEntryIdSize); + } + return desc->offset + desc->size; } +} - // Map the file to memory, with offset 0. 
- // file will be unmapped in ModuleUnload - // const void* image = nullptr; - if (!amd::Os::MemoryMapFileDesc(fdesc, fsize, 0, image)) { - return hipErrorInvalidValue; +/** + * @brief Extract code object from fatbin using comgr + * + * @param[in] data the bundle data(fatbin or loaded module data) + * @param[in] size the size of the bundle data + * @param[in] agent_triple_target_ids isa names of concerned devices + * @param[out] code_objs the buffer address and size pairs of extracted code objects of + * concerned devices + * Returned error code + * + * @return #hipSuccess, #hipErrorInvalidKernelFile, #hipErrorInvalidValue, #hipErrorNoBinaryForGpu + * + * @see FatBinaryInfo::ExtractFatBinaryUsingCOMGR + */ +hipError_t CodeObject::extractCodeObjectFromFatBinary( + const void* data, size_t size, const std::vector& agent_triple_target_ids, + std::vector>& code_objs) { + hipError_t hipStatus = hipSuccess; + amd_comgr_status_t comgrStatus = AMD_COMGR_STATUS_SUCCESS; + + const size_t num_devices = agent_triple_target_ids.size(); + size_t num_code_objs = num_devices; + bool isCompressed = false; + if (!IsClangOffloadMagicBundle(data, isCompressed)) { + LogPrintfInfo("IsClangOffloadMagicBundle(%p) return false", data); + // hipModuleLoadData() will possibly call here + return hipErrorInvalidKernelFile; } - // retrieve code_objs{binary_image, binary_size} for devices - return extractCodeObjectFromFatBinary(*image, device_names, code_objs); -} + if (size == 0) size = getFatbinSize(data, isCompressed); -// This will be moved to COMGR eventually -hipError_t CodeObject::ExtractCodeObjectFromMemory( - const void* data, const std::vector& device_names, - std::vector>& code_objs, std::string& uri) { - // Get the URI from memory - if (!amd::Os::GetURIFromMemory(data, 0, uri)) { - return hipErrorInvalidValue; - } + amd_comgr_data_t dataCodeObj{0}; + amd_comgr_data_set_t dataSetBundled{0}; + amd_comgr_data_set_t dataSetUnbundled{0}; + amd_comgr_action_info_t actionInfoUnbundle{0}; + amd_comgr_data_t item{0}; - return extractCodeObjectFromFatBinary(data, device_names, code_objs); -} -// This will be moved to COMGR eventually -hipError_t CodeObject::extractCodeObjectFromFatBinary( - const void* data, const std::vector& agent_triple_target_ids, - std::vector>& code_objs) { - std::string magic((const char*)data, kOffloadBundleMagicStrSize); - if (magic.compare(kOffloadBundleMagicStr)) { - return hipErrorInvalidKernelFile; + std::set devicesSet{}; // To make sure device is unique + std::vector bundleEntryIDs{}; + static const std::string hipv4 = kOffloadKindHipv4_; // bundled code objects need the prefix + for (size_t i = 0; i < num_devices; i++) { + devicesSet.insert(hipv4 + agent_triple_target_ids[i]); } - // Initialize Code objects - code_objs.reserve(agent_triple_target_ids.size()); - for (size_t i = 0; i < agent_triple_target_ids.size(); i++) { - code_objs.push_back(std::make_pair(nullptr, 0)); + for (auto& device : devicesSet) { + bundleEntryIDs.push_back(device.c_str()); } - const auto obheader = reinterpret_cast(data); - const auto* desc = &obheader->desc[0]; - size_t num_code_objs = code_objs.size(); - for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i, - desc = reinterpret_cast( - reinterpret_cast(&desc->bundleEntryId[0]) + - desc->bundleEntryIdSize)) { - const void* image = - reinterpret_cast(reinterpret_cast(obheader) + desc->offset); - const size_t image_size = desc->size; + do { + // Create Bundled dataset + comgrStatus = amd::Comgr::create_data_set(&dataSetBundled); + if (comgrStatus != 
AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::create_data_set() failed with status 0x%xh", comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } - if (num_code_objs == 0) break; - std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize}; + // CodeObject + comgrStatus = amd::Comgr::create_data(AMD_COMGR_DATA_KIND_OBJ_BUNDLE, &dataCodeObj); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError( + "amd::Comgr::create_data(AMD_COMGR_DATA_KIND_OBJ_BUNDLE) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } - std::string co_triple_target_id; - if (!getTripleTargetID(bundleEntryId, image, co_triple_target_id)) continue; + comgrStatus = amd::Comgr::set_data(dataCodeObj, size, static_cast(data)); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::set_data(size=%zu, data=%p) failed with status 0x%xh", size, data, + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + + comgrStatus = amd::Comgr::set_data_name(dataCodeObj, kHipFatBinName); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError( + "amd::Comgr::set_data_name("") failed with status 0x%xh", comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + comgrStatus = amd::Comgr::data_set_add(dataSetBundled, dataCodeObj); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::data_set_add() failed with status 0x%xh", comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + // Set up ActionInfo + comgrStatus = amd::Comgr::create_action_info(&actionInfoUnbundle); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::create_action_info() failed with status 0x%xh", comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } - for (size_t dev = 0; dev < agent_triple_target_ids.size(); ++dev) { - if (code_objs[dev].first) continue; - if (isCodeObjectCompatibleWithDevice(co_triple_target_id, agent_triple_target_ids[dev])) { - code_objs[dev] = std::make_pair(image, image_size); - --num_code_objs; + comgrStatus = amd::Comgr::action_info_set_language(actionInfoUnbundle, AMD_COMGR_LANGUAGE_HIP); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::action_info_set_language(HIP) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + + comgrStatus = amd::Comgr::action_info_set_bundle_entry_ids( + actionInfoUnbundle, bundleEntryIDs.data(), bundleEntryIDs.size()); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::action_info_set_bundle_entry_ids(%p, %zu) failed with status 0x%xh", + bundleEntryIDs.data(), bundleEntryIDs.size(), comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + + // Unbundle + comgrStatus = amd::Comgr::create_data_set(&dataSetUnbundled); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::create_data_set(&dataSetUnbundled) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + comgrStatus = amd::Comgr::do_action(AMD_COMGR_ACTION_UNBUNDLE, actionInfoUnbundle, dataSetBundled, + dataSetUnbundled); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::do_action(AMD_COMGR_ACTION_UNBUNDLE) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + + // Check CodeObject count + size_t count = 0; + comgrStatus = + amd::Comgr::action_data_count(dataSetUnbundled, AMD_COMGR_DATA_KIND_EXECUTABLE, &count); + if (comgrStatus 
!= AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::action_data_count() failed with status 0x%xh", comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + + // Initialize Code objects + code_objs.reserve(num_code_objs); + for (size_t i = 0; i < num_code_objs; i++) { + code_objs.push_back(std::make_pair(nullptr, 0)); + } + + for (size_t i = 0; i < count; i++) { + if (num_code_objs == 0) break; + + size_t itemSize = 0; + comgrStatus = amd::Comgr::action_data_get_data(dataSetUnbundled, + AMD_COMGR_DATA_KIND_EXECUTABLE, i, &item); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::action_data_get_data(%zu/%zu) failed with 0x%xh", i, count, + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + + comgrStatus = amd::Comgr::get_data_name(item, &itemSize, nullptr); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::get_data_name(%zu/%zu) failed with 0x%xh", i, count, + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + std::string bundleEntryId(itemSize, 0); + comgrStatus = amd::Comgr::get_data_name(item, &itemSize, bundleEntryId.data()); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::get_data_name(%zu/%zu, %d) failed with 0x%xh", i, count, + itemSize, comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + // Remove bundleEntryId_ + if (!consume(bundleEntryId, kOffloadHipV4FatBinName_)) { + // This is behavour in comgr unbundling which is subject to change. + // So just give info. + LogPrintfInfo("bundleEntryId=%s isn't prefixed with %s", bundleEntryId.c_str(), + kOffloadHipV4FatBinName_); + } + trimNameTail(bundleEntryId, '.'); // Remove .fileExtention + + char* itemData = nullptr; + for (size_t dev = 0; dev < num_devices; ++dev) { + if (code_objs[dev].first) continue; + //LogPrintfError("agent_triple_target_ids[%zu]=%s, bundleEntryId=%s", dev, + // agent_triple_target_ids[dev].c_str(), bundleEntryId.c_str()); + + if (bundleEntryId == agent_triple_target_ids[dev]) { + if (itemData == nullptr) { + itemSize = 0; + comgrStatus = amd::Comgr::get_data(item, &itemSize, nullptr); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::get_data(%zu/%zu) failed with 0x%xh", i, count, + comgrStatus); + hipStatus = hipErrorInvalidValue; + break; + } + + if (itemSize == 0) { + // If there isn't a code object for this device, + // amd::Comgr::do_action(AMD_COMGR_ACTION_UNBUNDLE) still returns item with + // valid name but no data. We need continue searching for other devices + LogPrintfInfo( + "amd::Comgr::get_data() return 0 size for agent_triple_target_ids[%zu]=%s", + dev, agent_triple_target_ids[dev].c_str()); + continue; + } + + // itemData should be deleted in fatbin's destructor + itemData = new char[itemSize]; + if (itemData == nullptr) { + LogError("no enough memory"); + hipStatus = hipErrorOutOfMemory; + break; + } + comgrStatus = amd::Comgr::get_data(item, &itemSize, itemData); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::get_data(%zu/%zu, %d) failed with 0x%xh", i, count, + itemSize, comgrStatus); + hipStatus = hipErrorInvalidValue; + delete []itemData; + itemData = nullptr; + break; + } + + } + code_objs[dev] = std::make_pair(reinterpret_cast(itemData), itemSize); + --num_code_objs; + LogPrintfInfo( + "Found agent_triple_target_ids[%zu]=%s: item: Data=%p(%s), " + "Size=%zu, num_code_objs=%zu", + dev, agent_triple_target_ids[dev].c_str(), itemData, + isCompressed ? 
"compressed" : "uncompressed", itemSize, num_code_objs); + } + } + + comgrStatus = amd::Comgr::release_data(item); + item.handle = 0; + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::release_data(item) failed with status 0x%xh", comgrStatus); + hipStatus = hipErrorInvalidValue; } + if (hipStatus != hipSuccess) break; } - } - if (num_code_objs == 0) { - return hipSuccess; - } else { - LogPrintfError("%s", - "hipErrorNoBinaryForGpu: Unable to find code object for all current devices!"); - LogPrintfError("%s", " Devices:"); - for (size_t i = 0; i < agent_triple_target_ids.size(); i++) { - LogPrintfError(" %s - [%s]", agent_triple_target_ids[i].c_str(), - ((code_objs[i].first) ? "Found" : "Not Found")); - } - const auto obheader = reinterpret_cast(data); - const auto* desc = &obheader->desc[0]; - LogPrintfError("%s", " Bundled Code Objects:"); - for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i, - desc = reinterpret_cast( - reinterpret_cast(&desc->bundleEntryId[0]) + - desc->bundleEntryIdSize)) { - std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize}; - const void* image = - reinterpret_cast(reinterpret_cast(obheader) + desc->offset); - - std::string co_triple_target_id; - bool valid_co = getTripleTargetID(bundleEntryId, image, co_triple_target_id); - - if (valid_co) { - LogPrintfError(" %s - [Code object targetID is %s]", bundleEntryId.c_str(), - co_triple_target_id.c_str()); - } else { - LogPrintfError(" %s - [Unsupported]", bundleEntryId.c_str()); + } while(0); + + if (hipStatus == hipSuccess && num_code_objs != 0) { + hipStatus = hipErrorNoBinaryForGpu; + + // Leave it for debug purpose in uncompressed mode. + if (!isCompressed) { + LogPrintfError("%s", + "hipErrorNoBinaryForGpu: Unable to find code object for all current devices!"); + LogPrintfError("%s", " Devices:"); + for (size_t i = 0; i < agent_triple_target_ids.size(); i++) { + LogPrintfError(" %s - [%s]", agent_triple_target_ids[i].c_str(), + ((code_objs[i].first) ? 
"Found" : "Not Found")); + } + const auto obheader = reinterpret_cast(data); + const auto* desc = &obheader->desc[0]; + LogPrintfError("%s", " Bundled Code Objects:"); + for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i, + desc = reinterpret_cast( + reinterpret_cast(&desc->bundleEntryId[0]) + + desc->bundleEntryIdSize)) { + std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize}; + const void* image = + reinterpret_cast(reinterpret_cast(obheader) + desc->offset); + + std::string co_triple_target_id; + bool valid_co = getTripleTargetID(bundleEntryId, image, co_triple_target_id); + + if (valid_co) { + LogPrintfError(" %s - [Code object targetID is %s]", bundleEntryId.c_str(), + co_triple_target_id.c_str()); + } else { + LogPrintfError(" %s - [Unsupported]", bundleEntryId.c_str()); + } } } - return hipErrorNoBinaryForGpu; } + + // Cleanup + if (actionInfoUnbundle.handle) { + comgrStatus = amd::Comgr::destroy_action_info(actionInfoUnbundle); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::destroy_action_info(actionInfoUnbundle) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + } + } + if (dataSetBundled.handle) { + comgrStatus = amd::Comgr::destroy_data_set(dataSetBundled); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::destroy_data_set(dataSetBundled) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + } + } + + if (dataSetUnbundled.handle) { + comgrStatus = amd::Comgr::destroy_data_set(dataSetUnbundled); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::destroy_data_set(dataSetUnbundled) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + } + } + + if (dataCodeObj.handle) { + comgrStatus = amd::Comgr::release_data(dataCodeObj); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::release_data(dataCodeObj) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + } + } + + if (item.handle) { + comgrStatus = amd::Comgr::release_data(item); + if (comgrStatus != AMD_COMGR_STATUS_SUCCESS) { + LogPrintfError("amd::Comgr::release_data(item) failed with status 0x%xh", + comgrStatus); + hipStatus = hipErrorInvalidValue; + } + } + + return hipStatus; } hipError_t DynCO::loadCodeObject(const char* fname, const void* image) { @@ -871,7 +1159,8 @@ hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hos hipError_t StatCO::registerStatGlobalVar(const void* hostVar, Var* var) { amd::ScopedLock lock(sclock_); - if (vars_.find(hostVar) != vars_.end()) { + auto var_it = vars_.find(hostVar); + if ((var_it != vars_.end()) && (var_it->second->getName() != var->getName())) { return hipErrorInvalidSymbol; } diff --git a/hipamd/src/hip_code_object.hpp b/hipamd/src/hip_code_object.hpp index d0148cb8a..d5b6df0a8 100644 --- a/hipamd/src/hip_code_object.hpp +++ b/hipamd/src/hip_code_object.hpp @@ -48,31 +48,22 @@ class CodeObject { size_t binary_size); static hipError_t build_module(hipModule_t hmod, const std::vector& devices); - // Given an file desc and file size, extracts to code object for corresponding devices, - // return code_objs{binary_ptr, binary_size}, which could be used to determine foffset - static hipError_t ExtractCodeObjectFromFile(amd::Os::FileDesc fdesc, size_t fsize, - const void ** image, const std::vector& device_names, - std::vector>& code_objs); - - // Given an ptr to memory, extracts to code object for corresponding 
devices, - // returns code_objs{binary_ptr, binary_size} and uniform resource indicator - static hipError_t ExtractCodeObjectFromMemory(const void* data, - const std::vector& device_names, - std::vector>& code_objs, - std::string& uri); - static uint64_t ElfSize(const void* emi); - static bool IsClangOffloadMagicBundle(const void* data); + static bool IsClangOffloadMagicBundle(const void* data, bool& isCompressed); -protected: - //Given an ptr to image or file, extracts to code object - //for corresponding devices - static hipError_t extractCodeObjectFromFatBinary(const void*, - const std::vector&, - std::vector>&); + // Given an ptr to image or file, extracts to code object + // for corresponding devices + static hipError_t extractCodeObjectFromFatBinary( + const void*, size_t, const std::vector&, + std::vector>&); + + // Return size of fat bin + static size_t getFatbinSize(const void* data, const bool isCompressed = false); +protected: CodeObject() {} + private: friend const std::vector& modules(); }; @@ -96,8 +87,12 @@ class DynCO : public CodeObject { hipError_t getManagedVarPointer(std::string name, void** pointer, size_t* size_ptr) const { auto it = vars_.find(name); if (it != vars_.end() && it->second->getVarKind() == Var::DVK_Managed) { - *pointer = it->second->getManagedVarPtr(); - *size_ptr = it->second->getSize(); + if (pointer != nullptr) { + *pointer = it->second->getManagedVarPtr(); + } + if (size_ptr != nullptr) { + *size_ptr = it->second->getSize(); + } } return hipSuccess; } diff --git a/hipamd/src/hip_context.cpp b/hipamd/src/hip_context.cpp index 4b7f5c809..6289a9891 100644 --- a/hipamd/src/hip_context.cpp +++ b/hipamd/src/hip_context.cpp @@ -61,6 +61,7 @@ void init(bool* status) { return; } g_devices.push_back(device); + amd::RuntimeTearDown::RegisterObject(device); } amd::Context* hContext = new amd::Context(devices, amd::Context::Info()); @@ -73,6 +74,7 @@ void init(bool* status) { hContext->release(); } host_context = hContext; + amd::RuntimeTearDown::RegisterObject(hContext); PlatformState::instance().init(); *status = true; @@ -95,7 +97,7 @@ hip::Stream* getStream(hipStream_t stream, bool wait) { hip::Stream* hip_stream = reinterpret_cast(stream); if (wait && !(hip_stream->Flags() & hipStreamNonBlocking)) { constexpr bool WaitNullStreamOnly = true; - iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly); + hip_stream->GetDevice()->WaitActiveStreams(hip_stream, WaitNullStreamOnly); } return hip_stream; } diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 019ee223e..c4c704b1b 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -43,7 +43,7 @@ hip::Stream* Device::NullStream(bool wait) { } if (wait == true) { // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); + WaitActiveStreams(null_stream_); } return null_stream_; } @@ -57,7 +57,7 @@ bool Device::Create() { } // Create graph memory pool - graph_mem_pool_ = new MemoryPool(this); + graph_mem_pool_ = new MemoryPool(this, nullptr, true); if (graph_mem_pool_ == nullptr) { return false; } @@ -149,11 +149,150 @@ void Device::Reset() { mem_pools_.clear(); } flags_ = hipDeviceScheduleSpin; - hip::Stream::destroyAllStreams(deviceId_); + destroyAllStreams(); amd::MemObjMap::Purge(devices()[0]); Create(); } +// ================================================================================================ +void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) { + amd::Command::EventWaitList 
eventWaitList(0); + bool submitMarker = 0; + + auto waitForStream = [&submitMarker, + &eventWaitList](hip::Stream* stream) { + if (amd::Command *command = stream->getLastQueuedCommand(true)) { + amd::Event &event = command->event(); + // Check HW status of the ROCcrl event. + // Note: not all ROCclr modes support HW status + bool ready = stream->device().IsHwEventReady(event); + if (!ready) { + ready = (command->status() == CL_COMPLETE); + } + submitMarker |= stream->vdev()->isFenceDirty(); + // Check the current active status + if (!ready) { + command->notifyCmdQueue(); + eventWaitList.push_back(command); + } else { + command->release(); + } + } + }; + + if (wait_null_stream) { + if (null_stream_) { + waitForStream(null_stream_); + } + } else { + amd::ScopedLock lock(streamSetLock); + + for (const auto& active_stream : streamSet) { + // If it's the current device + if (// Make sure it's a default stream + ((active_stream->Flags() & hipStreamNonBlocking) == 0) && + // and it's not the current stream + (active_stream != blocking_stream)) { + // Get the last valid command + waitForStream(active_stream); + } + } + } + + // Check if we have to wait anything + if (eventWaitList.size() > 0 || submitMarker) { + amd::Command* command = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); + if (command != nullptr) { + command->enqueue(); + command->release(); + } + } + + // Release all active commands. It's safe after the marker was enqueued + for (const auto& it : eventWaitList) { + it->release(); + } +} + +// ================================================================================================ +void Device::AddStream(Stream* stream) { + amd::ScopedLock lock(streamSetLock); + streamSet.insert(stream); +} + +// ================================================================================================ +void Device::RemoveStream(Stream* stream){ + amd::ScopedLock lock(streamSetLock); + streamSet.erase(stream); +} + +// ================================================================================================ +bool Device::StreamExists(Stream* stream){ + amd::ScopedLock lock(streamSetLock); + if (streamSet.find(stream) != streamSet.end()) { + return true; + } + return false; +} + +// ================================================================================================ +void Device::destroyAllStreams() { + std::vector toBeDeleted; + { + amd::ScopedLock lock(streamSetLock); + for (auto& it : streamSet) { + if (it->Null() == false ) { + toBeDeleted.push_back(it); + } + } + } + for (auto& it : toBeDeleted) { + hip::Stream::Destroy(it); + } +} + +// ================================================================================================ +void Device::SyncAllStreams( bool cpu_wait) { + // Make a local copy to avoid stalls for GPU finish with multiple threads + std::vector streams; + streams.reserve(streamSet.size()); + { + amd::ScopedLock lock(streamSetLock); + for (auto it : streamSet) { + streams.push_back(it); + it->retain(); + } + } + for (auto it : streams) { + it->finish(cpu_wait); + it->release(); + } + // Release freed memory for all memory pools on the device + ReleaseFreedMemory(); +} + +// ================================================================================================ +bool Device::StreamCaptureBlocking() { + amd::ScopedLock lock(streamSetLock); + for (auto& it : streamSet) { + if (it->GetCaptureStatus() == hipStreamCaptureStatusActive && it->Flags() != hipStreamNonBlocking) { + return true; + } + } + return false; +} + 
+// ================================================================================================ +bool Device::existsActiveStreamForDevice() { + amd::ScopedLock lock(streamSetLock); + for (const auto& active_stream : streamSet) { + if (active_stream->GetQueueStatus()) { + return true; + } + } + return false; +} + // ================================================================================================ Device::~Device() { if (default_mem_pool_ != nullptr) { diff --git a/hipamd/src/hip_device_runtime.cpp b/hipamd/src/hip_device_runtime.cpp index 19a045dba..a735d2039 100644 --- a/hipamd/src/hip_device_runtime.cpp +++ b/hipamd/src/hip_device_runtime.cpp @@ -541,7 +541,12 @@ hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device) { hipDeviceProp_tR0600 prop; HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device)); - snprintf(pciBusId, len, "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); + auto* deviceHandle = g_devices[device]->devices()[0]; + snprintf (pciBusId, len, "%04x:%02x:%02x.%01x", + prop.pciDomainID, + prop.pciBusID, + prop.pciDeviceID, + deviceHandle->info().deviceTopology_.pcie.function); HIP_RETURN(len <= 12 ? hipErrorInvalidValue : hipSuccess); } @@ -609,8 +614,9 @@ hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) { hipError_t hipDeviceSynchronize() { HIP_INIT_API(hipDeviceSynchronize); + CHECK_SUPPORTED_DURING_CAPTURE(); constexpr bool kDoWaitForCpu = true; - hip::Stream::SyncAllStreams(hip::getCurrentDevice()->deviceId(), kDoWaitForCpu); + hip::getCurrentDevice()->SyncAllStreams(kDoWaitForCpu); HIP_RETURN(hipSuccess); } diff --git a/hipamd/src/hip_event.cpp b/hipamd/src/hip_event.cpp index f527766e4..8a49bfd8e 100644 --- a/hipamd/src/hip_event.cpp +++ b/hipamd/src/hip_event.cpp @@ -434,9 +434,8 @@ hipError_t hipEventSynchronize(hipEvent_t event) { hip::Event* e = reinterpret_cast(event); hip::Stream* s = reinterpret_cast(e->GetCaptureStream()); if ((s != nullptr) && (s->GetCaptureStatus() == hipStreamCaptureStatusActive)) { - if (s->IsEventCaptured(event) == false) { - return HIP_RETURN(hipErrorStreamCaptureUnsupported); - } + s->SetCaptureStatus(hipStreamCaptureStatusInvalidated); + return HIP_RETURN(hipErrorCapturedEvent); } if (hip::Stream::StreamCaptureOngoing(e->GetCaptureStream()) == true) { HIP_RETURN(hipErrorStreamCaptureUnsupported); @@ -455,6 +454,11 @@ hipError_t ihipEventQuery(hipEvent_t event) { } hip::Event* e = reinterpret_cast(event); + hip::Stream* s = reinterpret_cast(e->GetCaptureStream()); + if ((s != nullptr) && (s->GetCaptureStatus() == hipStreamCaptureStatusActive)) { + s->SetCaptureStatus(hipStreamCaptureStatusInvalidated); + return HIP_RETURN(hipErrorCapturedEvent); + } return e->query(); } diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp index 562f19386..a66b1106d 100644 --- a/hipamd/src/hip_fatbin.cpp +++ b/hipamd/src/hip_fatbin.cpp @@ -50,57 +50,46 @@ FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) : fdesc_(amd: } FatBinaryInfo::~FatBinaryInfo() { - + // Different devices in the same model have the same binary_image_ + std::set toDelete; // Release per device fat bin info. 
for (auto* fbd: fatbin_dev_info_) { if (fbd != nullptr) { + if (fbd->binary_image_ && fbd->binary_offset_ == 0 && fbd->binary_image_ != image_) { + // binary_image_ was allocated in CodeObject::extractCodeObjectFromFatBinary + toDelete.insert(fbd->binary_image_); + } delete fbd; } } - if (!HIP_USE_RUNTIME_UNBUNDLER) { - // Using COMGR Unbundler - if (ufd_ && amd::Os::isValidFileDesc(ufd_->fdesc_)) { - // Check for ufd_ != nullptr, since sometimes, we never create unique_file_desc. - if (ufd_->fsize_ && image_mapped_ - && !amd::Os::MemoryUnmapFile(image_, ufd_->fsize_)) { - LogPrintfError("Cannot unmap file for fdesc: %d fsize: %d", ufd_->fdesc_, ufd_->fsize_); - assert(false); - } - if (!PlatformState::instance().CloseUniqueFileHandle(ufd_)) { - LogPrintfError("Cannot close file for fdesc: %d", ufd_->fdesc_); - assert(false); - } - } - - fname_ = std::string(); - fdesc_ = amd::Os::FDescInit(); - fsize_ = 0; - image_ = nullptr; - uri_ = std::string(); + for (auto itemData : toDelete) { + LogPrintfInfo("~FatBinaryInfo(%p) will delete binary_image_ %p", this, itemData); + delete[] reinterpret_cast(itemData); + } - if (0 == PlatformState::instance().UfdMapSize()) { - LogError("All Unique FDs are closed"); + // Using COMGR Unbundler + if (ufd_ && amd::Os::isValidFileDesc(ufd_->fdesc_)) { + // Check for ufd_ != nullptr, since sometimes, we never create unique_file_desc. + if (ufd_->fsize_ && image_mapped_ + && !amd::Os::MemoryUnmapFile(image_, ufd_->fsize_)) { + LogPrintfError("Cannot unmap file for fdesc: %d fsize: %d", ufd_->fdesc_, ufd_->fsize_); + assert(false); } - - } else { - // Using Runtime Unbundler - if (amd::Os::isValidFileDesc(fdesc_)) { - if (fsize_ && !amd::Os::MemoryUnmapFile(image_, fsize_)) { - LogPrintfError("Cannot unmap file for fdesc: %d fsize: %d", fdesc_, fsize_); - assert(false); - } - if (!amd::Os::CloseFileHandle(fdesc_)) { - LogPrintfError("Cannot close file for fdesc: %d", fdesc_); - assert(false); - } + if (!PlatformState::instance().CloseUniqueFileHandle(ufd_)) { + LogPrintfError("Cannot close file for fdesc: %d", ufd_->fdesc_); + assert(false); } + } - fname_ = std::string(); - fdesc_ = amd::Os::FDescInit(); - fsize_ = 0; - image_ = nullptr; - uri_ = std::string(); + fname_ = std::string(); + fdesc_ = amd::Os::FDescInit(); + fsize_ = 0; + image_ = nullptr; + uri_ = std::string(); + + if (0 == PlatformState::instance().UfdMapSize()) { + LogError("All Unique FDs are closed"); } } @@ -114,11 +103,8 @@ void ListAllDeviceWithNoCOFromBundle(const std::unordered_map& devices) { - amd_comgr_data_t data_object {0}; - amd_comgr_status_t comgr_status = AMD_COMGR_STATUS_SUCCESS; +hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector& devices) { hipError_t hip_status = hipSuccess; - // If image was passed as a pointer to our hipMod* api, we can try to extract the file name // if it was mapped by the app. Otherwise use the COMGR data API. 
if (fname_.size() == 0) { @@ -163,107 +149,52 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectordeviceId()] - = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0); - fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ - = new amd::Program(*devices[dev_idx]->asContext()); + std::vector> code_objs; + // Copy device names + std::vector device_names; + device_names.reserve(devices.size()); + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + device_names.push_back(devices[dev_idx]->devices()[0]->isa().isaName()); + } + hip_status = CodeObject::extractCodeObjectFromFatBinary( + image_, 0, device_names, code_objs); + if (hip_status == hipErrorNoBinaryForGpu || hip_status == hipSuccess) { + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + if (code_objs[dev_idx].first) { + fatbin_dev_info_[devices[dev_idx]->deviceId()] + = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, 0); + + fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ + = new amd::Program(*devices[dev_idx]->asContext()); + if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) { + break; + } + } + else { + // This is the case of hipErrorNoBinaryForGpu which will finally fail app + LogPrintfError("Cannot find CO in the bundle %s for ISA: %s", fname_.c_str(), + device_names[dev_idx].c_str()); + } + } + } + else if (hip_status == hipErrorInvalidKernelFile) { + hip_status = hipSuccess; + // If the image ptr is not clang offload bundle then just directly point the image. + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + fatbin_dev_info_[devices[dev_idx]->deviceId()] = + new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0); + fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ = + new amd::Program(*devices[dev_idx]->asContext()); if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == nullptr) { hip_status = hipErrorOutOfMemory; break; } } - break; } - - // Create a data object, if it fails return error - if ((comgr_status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_FATBIN, &data_object)) - != AMD_COMGR_STATUS_SUCCESS) { - LogPrintfError("Creating data object failed with status %d ", comgr_status); - hip_status = hipErrorInvalidValue; - break; - } - -#if !defined(_WIN32) - // Using the file descriptor and file size, map the data object. - if (amd::Os::isValidFileDesc(fdesc_)) { - guarantee(fsize_ > 0, "Cannot have a file size of 0, fdesc: %d fname: %s", - fdesc_, fname_.c_str()); - if ((comgr_status = amd_comgr_set_data_from_file_slice(data_object, fdesc_, foffset_, - fsize_)) != AMD_COMGR_STATUS_SUCCESS) { - LogPrintfError("Setting data from file slice failed with status %d ", comgr_status); - hip_status = hipErrorInvalidValue; - break; - } - } else -#endif - if (image_ != nullptr) { - // Using the image ptr, map the data object. - if ((comgr_status = amd_comgr_set_data(data_object, 4096, - reinterpret_cast(image_))) != AMD_COMGR_STATUS_SUCCESS) { - LogPrintfError("Setting data from file slice failed with status %d ", comgr_status); - hip_status = hipErrorInvalidValue; - break; - } - } else { - guarantee(false, "Cannot have both fname_ and image_ as nullptr"); - } - - // Find the unique number of ISAs needed for this COMGR query. 
- std::unordered_map> unique_isa_names; - for (auto device : devices) { - std::string device_name = device->devices()[0]->isa().isaName(); - unique_isa_names.insert({device_name, std::make_pair(0,0)}); - } - - // Create a query list using COMGR info for unique ISAs. - std::vector query_list_array; - query_list_array.reserve(unique_isa_names.size()); - for (const auto &isa_name : unique_isa_names) { - auto &item = query_list_array.emplace_back(); - item.isa = isa_name.first.c_str(); - item.size = 0; - item.offset = 0; - } - - // Look up the code object info passing the query list. - if ((comgr_status = amd_comgr_lookup_code_object(data_object, query_list_array.data(), - unique_isa_names.size())) != AMD_COMGR_STATUS_SUCCESS) { - LogPrintfError("Setting data from file slice failed with status %d ", comgr_status); - hip_status = hipErrorInvalidValue; - break; - } - - for (const auto &item : query_list_array) { - auto unique_it = unique_isa_names.find(item.isa); - guarantee(unique_isa_names.cend() != unique_it, "Cannot find unique isa "); - unique_it->second = std::pair - (static_cast(item.size), - static_cast(item.offset)); - } - - for (auto device : devices) { - std::string device_name = device->devices()[0]->isa().isaName(); - auto dev_it = unique_isa_names.find(device_name); - // If the size is 0, then COMGR API could not find the CO for this GPU device/ISA - if (dev_it->second.first == 0) { - LogPrintfError("Cannot find CO in the bundle %s for ISA: %s", - fname_.c_str(), device_name.c_str()); - hip_status = hipErrorNoBinaryForGpu; - ListAllDeviceWithNoCOFromBundle(unique_isa_names); - break; - } - guarantee(unique_isa_names.cend() != dev_it, - "Cannot find the device name in the unique device name"); - fatbin_dev_info_[device->deviceId()] - = new FatBinaryDeviceInfo(reinterpret_cast
(const_cast(image_)) - + dev_it->second.second, dev_it->second.first, - dev_it->second.second); - fatbin_dev_info_[device->deviceId()]->program_ - = new amd::Program(*(device->asContext())); + else { + LogPrintfError( + "CodeObject::extractCodeObjectFromFatBinary failed with status %d\n", + hip_status); } } while(0); @@ -286,115 +217,9 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector& devices) { - if (!HIP_USE_RUNTIME_UNBUNDLER) { - return ExtractFatBinaryUsingCOMGR(devices); - } - - hipError_t hip_error = hipSuccess; - std::vector> code_objs; - - // Copy device names for Extract Code object File - std::vector device_names; - device_names.reserve(devices.size()); - for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { - device_names.push_back(devices[dev_idx]->devices()[0]->isa().isaName()); - } - - // We are given file name, get the file desc and file size - if (fname_.size() > 0) { - // Get File Handle & size of the file. - if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) { - return hipErrorFileNotFound; - } - if (fsize_ == 0) { - return hipErrorInvalidImage; - } - - // Extract the code object from file - hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, &image_, - device_names, code_objs); - - } else if (image_ != nullptr) { - // We are directly given image pointer directly, try to extract file desc & file Size - hip_error = CodeObject::ExtractCodeObjectFromMemory(image_, - device_names, code_objs, uri_); - } else { - return hipErrorInvalidValue; - } - - if (hip_error == hipErrorNoBinaryForGpu) { - if (fname_.size() > 0) { - LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for file: %s", fname_.c_str()); - } else { - LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for ptr: 0x%x", image_); - } - - // For the condition: unable to find code object for all devices, - // still extract available images to those devices owning them. - // This helps users to work with ROCm if there is any supported - // GFX on system. - for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { - if (code_objs[dev_idx].first) { - // Calculate the offset wrt binary_image and the original image - size_t offset_l - = (reinterpret_cast
(const_cast(code_objs[dev_idx].first)) - - reinterpret_cast
(const_cast(image_))); - - fatbin_dev_info_[devices[dev_idx]->deviceId()] - = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l); - - fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ - = new amd::Program(*devices[dev_idx]->asContext()); - if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) { - break; - } - } - } - - return hip_error; - } - - if (hip_error == hipErrorInvalidKernelFile) { - for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { - // the image type is no CLANG_OFFLOAD_BUNDLER, image for current device directly passed - fatbin_dev_info_[devices[dev_idx]->deviceId()] - = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0); - } - } else if(hip_error == hipSuccess) { - for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { - // Calculate the offset wrt binary_image and the original image - size_t offset_l - = (reinterpret_cast
(const_cast(code_objs[dev_idx].first)) - - reinterpret_cast
(const_cast(image_))); - - fatbin_dev_info_[devices[dev_idx]->deviceId()] - = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l); - } - } - - for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { - fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ - = new amd::Program(*devices[dev_idx]->asContext()); - if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) { - return hipErrorOutOfMemory; - } - } - - return hipSuccess; -} - hipError_t FatBinaryInfo::AddDevProgram(const int device_id) { // Device Id bounds Check DeviceIdCheck(device_id); diff --git a/hipamd/src/hip_fatbin.hpp b/hipamd/src/hip_fatbin.hpp index f9057a4b5..ac6d64e19 100644 --- a/hipamd/src/hip_fatbin.hpp +++ b/hipamd/src/hip_fatbin.hpp @@ -64,7 +64,6 @@ class FatBinaryInfo { ~FatBinaryInfo(); // Loads Fat binary from file or image, unbundles COs for devices. - hipError_t ExtractFatBinaryUsingCOMGR(const std::vector& devices); hipError_t ExtractFatBinary(const std::vector& devices); hipError_t AddDevProgram(const int device_id); hipError_t BuildProgram(const int device_id); diff --git a/hipamd/src/hip_global.hpp b/hipamd/src/hip_global.hpp index 6de240b90..a3f6d29a0 100644 --- a/hipamd/src/hip_global.hpp +++ b/hipamd/src/hip_global.hpp @@ -126,6 +126,7 @@ class Var { FatBinaryInfo** moduleInfo() { return modules_; }; DeviceVarKind getVarKind() const { return dVarKind_; } size_t getSize() const { return size_; } + std::string getName() const { return name_; } void* getManagedVarPtr() { return managedVarPtr_; }; void setManagedVarInfo(void* pointer, size_t size) { diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp index 0fff3323c..8440be6f9 100644 --- a/hipamd/src/hip_graph.cpp +++ b/hipamd/src/hip_graph.cpp @@ -137,7 +137,7 @@ hipError_t ihipDrvGraphAddMemcpyNode(hip::GraphNode** pGraphNode, hip::Graph* gr (numDependencies > 0 && pDependencies == nullptr) || pCopyParams == nullptr) { return hipErrorInvalidValue; } - hipError_t status = ihipDrvMemcpy3DParamValidate(pCopyParams); + hipError_t status = ihipDrvMemcpy3D_validate(pCopyParams); if (status != hipSuccess) { return status; } @@ -165,7 +165,8 @@ hipError_t ihipGraphAddMemcpyNode1D(hip::GraphNode** pGraphNode, hip::Graph* gra hipError_t ihipGraphAddMemsetNode(hip::GraphNode** pGraphNode, hip::Graph* graph, hip::GraphNode* const* pDependencies, size_t numDependencies, - const hipMemsetParams* pMemsetParams, bool capture = true) { + const hipMemsetParams* pMemsetParams, + bool capture = true, size_t depth = 1) { if (pGraphNode == nullptr || graph == nullptr || pMemsetParams == nullptr || (numDependencies > 0 && pDependencies == nullptr) || pMemsetParams->height == 0) { return hipErrorInvalidValue; @@ -181,6 +182,9 @@ hipError_t ihipGraphAddMemsetNode(hip::GraphNode** pGraphNode, hip::Graph* graph if (status != hipSuccess) { return status; } + if (depth == 0) { + return hipErrorInvalidValue; + } if (pMemsetParams->height == 1) { status = ihipMemset_validate(pMemsetParams->dst, pMemsetParams->value, pMemsetParams->elementSize, @@ -189,15 +193,16 @@ hipError_t ihipGraphAddMemsetNode(hip::GraphNode** pGraphNode, hip::Graph* graph if (pMemsetParams->pitch < (pMemsetParams->width * pMemsetParams->elementSize)) { return hipErrorInvalidValue; } - auto sizeBytes = pMemsetParams->width * pMemsetParams->height * pMemsetParams->elementSize * 1; + auto sizeBytes = pMemsetParams->width * pMemsetParams->height * + depth * pMemsetParams->elementSize; status = ihipMemset3D_validate( {pMemsetParams->dst, 
pMemsetParams->pitch, pMemsetParams->width, pMemsetParams->height}, - pMemsetParams->value, {pMemsetParams->width, pMemsetParams->height, 1}, sizeBytes); + pMemsetParams->value, {pMemsetParams->width, pMemsetParams->height, depth}, sizeBytes); } if (status != hipSuccess) { return status; } - *pGraphNode = new hip::GraphMemsetNode(pMemsetParams); + *pGraphNode = new hip::GraphMemsetNode(pMemsetParams, depth); status = ihipGraphAddNode(*pGraphNode, graph, pDependencies, numDependencies, capture); return status; } @@ -484,7 +489,7 @@ hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D* } p.dstArray = pCopy->dstArray; p.dstPos = {pCopy->dstXInBytes, pCopy->dstY, 0}; - p.dstPtr.pitch = pCopy->srcPitch; + p.dstPtr.pitch = pCopy->dstPitch; if (pCopy->dstDevice != nullptr) { p.dstPtr.ptr = pCopy->dstDevice; } @@ -719,6 +724,7 @@ hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitc memsetParams.width = width; memsetParams.height = height; memsetParams.pitch = pitch; + memsetParams.elementSize = 1; hip::Stream* s = reinterpret_cast(stream); hip::GraphNode* pGraphNode; hipError_t status = @@ -735,9 +741,25 @@ hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDe hipExtent& extent) { ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset3D on stream : %p", stream); + hipMemsetParams memsetParams = {0}; if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } + memsetParams.dst = pitchedDevPtr.ptr; + memsetParams.value = value; + memsetParams.width = extent.width; + memsetParams.height = extent.height; + memsetParams.pitch = pitchedDevPtr.pitch; + memsetParams.elementSize = 1; + hip::Stream* s = reinterpret_cast(stream); + hip::GraphNode* pGraphNode; + hipError_t status = + ihipGraphAddMemsetNode(&pGraphNode, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(), + s->GetLastCapturedNodes().size(), &memsetParams, true, extent.depth); + if (status != hipSuccess) { + return status; + } + s->SetLastCapturedNode(pGraphNode); return hipSuccess; } @@ -960,7 +982,7 @@ hipError_t hipStreamEndCapture_common(hipStream_t stream, hip::Graph** pGraph) { if (s->GetCaptureStatus() == hipStreamCaptureStatusNone) { return hipErrorIllegalState; } - // Capture must be ended on the same stream in which it was initiated + // Capture must be ended on the same stream in which it was initiated if (!s->IsOriginStream()) { return hipErrorStreamCaptureUnmatched; } @@ -977,15 +999,17 @@ hipError_t hipStreamEndCapture_common(hipStream_t stream, hip::Graph** pGraph) { amd::ScopedLock lock(g_captureStreamsLock); g_captureStreams.erase(std::find(g_captureStreams.begin(), g_captureStreams.end(), s)); } + { + amd::ScopedLock lock(g_streamSetLock); + g_allCapturingStreams.erase( + std::find(g_allCapturingStreams.begin(), g_allCapturingStreams.end(), s)); + } // If capture was invalidated, due to a violation of the rules of stream capture if (s->GetCaptureStatus() == hipStreamCaptureStatusInvalidated) { *pGraph = nullptr; return hipErrorStreamCaptureInvalidated; } - { - amd::ScopedLock lock(g_streamSetLock); - g_allCapturingStreams.erase(std::find(g_allCapturingStreams.begin(), g_allCapturingStreams.end(), s)); - } + // check if all parallel streams have joined // Nodes that are removed from the dependency set via API hipStreamUpdateCaptureDependencies do // not result in hipErrorStreamCaptureUnjoined @@ -1166,6 +1190,10 @@ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraph if (clonedNode == 
nullptr) { HIP_RETURN(hipErrorInvalidValue); } + hipMemcpyKind oldkind = reinterpret_cast(clonedNode)->GetMemcpyKind(); + if (oldkind != kind) { + HIP_RETURN(hipErrorInvalidValue); + } HIP_RETURN(reinterpret_cast(clonedNode)->SetParams(dst, src, count, kind)); } @@ -1555,6 +1583,12 @@ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo if (clonedNode == nullptr) { HIP_RETURN(hipErrorInvalidValue); } + + hipMemcpyKind oldkind = reinterpret_cast(clonedNode)->GetMemcpyKind(); + hipMemcpyKind newkind = pNodeParams->kind; + if (oldkind != newkind) { + HIP_RETURN(hipErrorInvalidValue); + } HIP_RETURN(reinterpret_cast(clonedNode)->SetParams(pNodeParams)); } @@ -2088,6 +2122,11 @@ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, if (clonedNode == nullptr) { HIP_RETURN(hipErrorInvalidValue); } + + hipMemcpyKind oldkind = reinterpret_cast(clonedNode)->GetMemcpyKind(); + if (oldkind != kind) { + HIP_RETURN(hipErrorInvalidValue); + } constexpr bool kCheckDeviceIsSame = true; HIP_RETURN(reinterpret_cast(clonedNode) ->SetParams(dst, symbol, count, offset, kind, kCheckDeviceIsSame)); @@ -2158,6 +2197,10 @@ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hi if (clonedNode == nullptr) { HIP_RETURN(hipErrorInvalidValue); } + hipMemcpyKind oldkind = reinterpret_cast(clonedNode)->GetMemcpyKind(); + if (oldkind != kind) { + HIP_RETURN(hipErrorInvalidValue); + } constexpr bool kCheckDeviceIsSame = true; HIP_RETURN(reinterpret_cast(clonedNode) ->SetParams(symbol, src, count, offset, kind, kCheckDeviceIsSame)); @@ -2458,37 +2501,45 @@ hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node, hipMemAllocNodePar HIP_RETURN(hipSuccess); } -// ================================================================================================ -hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, - const hipGraphNode_t* pDependencies, size_t numDependencies, - void* dev_ptr) { - HIP_INIT_API(hipGraphAddMemFreeNode, pGraphNode, graph, pDependencies, numDependencies, dev_ptr); - if (pGraphNode == nullptr || graph == nullptr || - ((numDependencies > 0 && pDependencies == nullptr) || - (pDependencies != nullptr && numDependencies == 0)) || - dev_ptr == nullptr) { - HIP_RETURN(hipErrorInvalidValue); - } - +hipError_t ihipGraphAddMemFreeNode(hip::GraphNode** graphNode, hip::Graph* graph, + hip::GraphNode* const* pDependencies, size_t numDependencies, + void* dptr) { // Is memory passed to be free'd valid size_t offset = 0; - auto memory = getMemoryObject(dev_ptr, offset); + auto memory = getMemoryObject(dptr, offset); if (memory == nullptr) { if (HIP_MEM_POOL_USE_VM) { // When VM is on the address must be valid and may point to a VA object - memory = amd::MemObjMap::FindVirtualMemObj(dev_ptr); + memory = amd::MemObjMap::FindVirtualMemObj(dptr); } if (memory == nullptr) { HIP_RETURN(hipErrorInvalidValue); } } - auto mem_free_node = new hip::GraphMemFreeNode(dev_ptr); - hip::GraphNode* node = mem_free_node; + auto mem_free_node = new hip::GraphMemFreeNode(dptr); + *graphNode = mem_free_node; auto status = - ihipGraphAddNode(node, reinterpret_cast(graph), - reinterpret_cast(pDependencies), numDependencies); - *pGraphNode = reinterpret_cast(node); + ihipGraphAddNode(*graphNode, graph, pDependencies, numDependencies); + HIP_RETURN(status); +} +// ================================================================================================ +hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t 
graph, + const hipGraphNode_t* pDependencies, size_t numDependencies, + void* dev_ptr) { + HIP_INIT_API(hipGraphAddMemFreeNode, pGraphNode, graph, pDependencies, numDependencies, dev_ptr); + if (pGraphNode == nullptr || graph == nullptr || + ((numDependencies > 0 && pDependencies == nullptr) || + (pDependencies != nullptr && numDependencies == 0)) || + dev_ptr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + hip::GraphNode* pNode; + auto status = + ihipGraphAddMemFreeNode(&pNode, + reinterpret_cast(graph), + reinterpret_cast(pDependencies), numDependencies, dev_ptr); + *pGraphNode = reinterpret_cast(pNode); HIP_RETURN(status); } @@ -2993,4 +3044,86 @@ hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraph nodeParams)); } +hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, + const hipGraphNode_t* dependencies, size_t numDependencies, + hipDeviceptr_t dptr) { + HIP_INIT_API(hipDrvGraphAddMemFreeNode, phGraphNode, hGraph, dependencies, numDependencies, dptr); + if (phGraphNode == nullptr || hGraph == nullptr || + ((numDependencies > 0 && dependencies == nullptr) || + (dependencies != nullptr && numDependencies == 0)) || + dptr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + // Is memory passed to be free'd valid + size_t offset = 0; + auto memory = getMemoryObject(dptr, offset); + if (memory == nullptr) { + if (HIP_MEM_POOL_USE_VM) { + // When VM is on the address must be valid and may point to a VA object + memory = amd::MemObjMap::FindVirtualMemObj(dptr); + } + if (memory == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + } + hip::GraphNode* pNode; + auto status = + ihipGraphAddMemFreeNode(&pNode, + reinterpret_cast(hGraph), + reinterpret_cast(dependencies), numDependencies, dptr); + *phGraphNode = reinterpret_cast(pNode); + HIP_RETURN(status); +} + +hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, + const HIP_MEMCPY3D* copyParams, hipCtx_t ctx) { + HIP_INIT_API(hipDrvGraphExecMemcpyNodeSetParams, hGraphExec, hNode, copyParams); + hip::GraphNode* n = reinterpret_cast(hNode); + if (hGraphExec == nullptr || + !hip::GraphNode::isNodeValid(reinterpret_cast(n))) { + HIP_RETURN(hipErrorInvalidValue); + } + if (ihipDrvMemcpy3D_validate(copyParams) != hipSuccess) { + HIP_RETURN(hipErrorInvalidValue); + } + // Check if pNodeParams passed is a empty struct + if (((copyParams->srcArray == 0) && (copyParams->srcHost == nullptr) + && (copyParams->srcDevice == nullptr)) || + ((copyParams->dstArray == 0) && (copyParams->dstHost == nullptr) + && (copyParams->dstDevice == nullptr))) { + HIP_RETURN(hipErrorInvalidValue); + } + hip::GraphNode* clonedNode = reinterpret_cast(hGraphExec)->GetClonedNode(n); + if (clonedNode == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + HIP_RETURN(reinterpret_cast(clonedNode)->SetParams(copyParams)); +} + +hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, + const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx) { + HIP_INIT_API(hipDrvGraphExecMemsetNodeSetParams, hGraphExec, hNode, memsetParams); + hip::GraphNode* n = reinterpret_cast(hNode); + + if (hGraphExec == nullptr || !hip::GraphNode::isNodeValid(n) || memsetParams == nullptr || + memsetParams->dst == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + hipMemsetParams pmemsetParams; + pmemsetParams.dst = reinterpret_cast(memsetParams->dst); + pmemsetParams.elementSize = memsetParams->elementSize; + pmemsetParams.height = memsetParams->height; 
+ pmemsetParams.pitch = memsetParams->pitch; + pmemsetParams.value = memsetParams->value; + pmemsetParams.width = memsetParams->width; + if (ihipGraphMemsetParams_validate(&pmemsetParams) != hipSuccess) { + HIP_RETURN(hipErrorInvalidValue); + } + hip::GraphNode* clonedNode = reinterpret_cast(hGraphExec)->GetClonedNode(n); + if (clonedNode == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + HIP_RETURN(reinterpret_cast(clonedNode)->SetParams(memsetParams, true)); +} + } // namespace hip diff --git a/hipamd/src/hip_graph_helper.hpp b/hipamd/src/hip_graph_helper.hpp index e77954d3d..c14662118 100644 --- a/hipamd/src/hip_graph_helper.hpp +++ b/hipamd/src/hip_graph_helper.hpp @@ -27,8 +27,6 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p); hipError_t ihipDrvMemcpy3D_validate(const HIP_MEMCPY3D* pCopy); -hipError_t ihipDrvMemcpy3DParamValidate(const HIP_MEMCPY3D* pCopy); - hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index ec42414f8..a571a1d62 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -395,18 +395,20 @@ hipError_t GraphExec::CaptureAQLPackets() { } } - if (device_kernarg_pool_ && !device->isXgmi()) { - if (device->info().hdpMemFlushCntl != nullptr) { + if (device_kernarg_pool_) { + auto kernArgImpl = device->settings().kernel_arg_impl_; + + if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) { *device->info().hdpMemFlushCntl = 1u; - if (*device->info().hdpMemFlushCntl != UINT32_MAX) { - LogError("Unexpected HDP Register readback value!"); - } - } else { - amd::Command* command = new amd::Marker(*capture_stream_, true); - if (command != nullptr) { - command->enqueue(); - command->release(); - } + volatile auto kSentinel = *device->info().hdpMemFlushCntl; + } else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && + kernarg_pool_size_graph_ != 0) { + address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_; + volatile auto kSentinel = *(dev_ptr - 1); + _mm_sfence(); + *(dev_ptr - 1) = kSentinel; + _mm_mfence(); + kSentinel = *(dev_ptr - 1); } } diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index a80bb71c7..97192ced9 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -233,7 +233,7 @@ struct GraphNode : public hipGraphNodeDOTAttribute { } // Return gpu packet address to update with actual packet under capture. uint8_t* GetAqlPacket() { return gpuPacket_; } - void SetKernelName(std::string kernelName) { capturedKernelName_ = kernelName; } + void SetKernelName(const std::string& kernelName) { capturedKernelName_ = kernelName; } const std::string& GetKernelName() const { return capturedKernelName_; } hip::Stream* GetQueue() const { return stream_; } @@ -495,6 +495,7 @@ struct Graph { void* ptr; const auto& dev_info = g_devices[0]->devices()[0]->info(); + size = amd::alignUp(size, dev_info.virtualMemAllocGranularity_); // Single virtual alloc would reserve for all devices. 
ptr = g_devices[0]->devices()[0]->virtualAlloc(startAddress, size, dev_info.virtualMemAllocGranularity_); @@ -647,6 +648,8 @@ struct GraphExec { // Capture GPU Packets from graph commands hipError_t CaptureAQLPackets(); hipError_t UpdateAQLPacket(hip::GraphKernelNode* node); + + using KernelArgImpl = device::Settings::KernelArgImpl; }; struct ChildGraphNode : public GraphNode { @@ -828,7 +831,7 @@ class GraphKernelNode : public GraphNode { hipFunction_t func = getFunc(kernelParams_, ihipGetDevice()); hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func); std::string label; - char buffer[500]; + char buffer[4096]; if (flag == hipGraphDebugDotFlagsVerbose) { sprintf(buffer, "{\n%s\n| {ID | %d | %s\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>}\n| {{node " @@ -1071,7 +1074,7 @@ class GraphKernelNode : public GraphNode { hipError_t SetAttrParams(hipKernelNodeAttrID attr, const hipKernelNodeAttrValue* params) { hipDeviceProp_t prop = {0}; - hipError_t status = ihipGetDeviceProperties(&prop, ihipGetDevice()); + hipError_t status = ihipGetDeviceProperties(&prop, ihipGetDevice()); if (hipSuccess != status){ return status; } @@ -1242,7 +1245,7 @@ class GraphMemcpyNode : public GraphNode { std::memcpy(params, ©Params_, sizeof(hipMemcpy3DParms)); } - virtual hipMemcpyKind GetMemcpyKind() const { return hipMemcpyDefault; }; + virtual hipMemcpyKind GetMemcpyKind() const { return copyParams_.kind; }; hipError_t SetParams(const hipMemcpy3DParms* params) { hipError_t status = ValidateParams(params); @@ -1312,7 +1315,7 @@ class GraphMemcpyNode : public GraphNode { } std::string label; if (flag == hipGraphDebugDotFlagsMemcpyNodeParams || flag == hipGraphDebugDotFlagsVerbose) { - char buffer[500]; + char buffer[4096]; sprintf( buffer, "{\n%s\n| {{ID | node handle} | {%u | %p}}\n| {kind | %s}\n| {{srcPtr | dstPtr} | " @@ -1490,7 +1493,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode { } std::string label; if (flag == hipGraphDebugDotFlagsMemcpyNodeParams || flag == hipGraphDebugDotFlagsVerbose) { - char buffer[500]; + char buffer[4096]; sprintf( buffer, "{\n%s\n| {{ID | node handle} | {%u | %p}}\n| {kind | %s}\n| {{srcPtr | dstPtr} | " @@ -1678,9 +1681,16 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D { } size_t dOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, dOffset); + cl_mem_flags srcFlag = 0; + if (srcMemory != nullptr) { + srcFlag = srcMemory->getMemFlags(); + if (!IS_LINUX) { + srcFlag &= ~ROCCLR_MEM_INTERPROCESS; + } + } if (srcMemory == nullptr && kind != hipMemcpyHostToDevice && kind != hipMemcpyDefault) { return hipErrorInvalidValue; - } else if (srcMemory != nullptr && srcMemory->getMemFlags() == 0 && + } else if (srcMemory != nullptr && srcFlag == 0 && kind != hipMemcpyDeviceToDevice && kind != hipMemcpyDeviceToDeviceNoCU && kind != hipMemcpyDefault) { return hipErrorInvalidValue; @@ -1704,16 +1714,17 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D { }; class GraphMemsetNode : public GraphNode { hipMemsetParams memsetParams_; - + size_t depth_ = 1; public: - GraphMemsetNode(const hipMemsetParams* pMemsetParams) + GraphMemsetNode(const hipMemsetParams* pMemsetParams, size_t depth = 1) : GraphNode(hipGraphNodeTypeMemset, "solid", "invtrapezium", "MEMSET") { memsetParams_ = *pMemsetParams; + depth_ = depth; size_t sizeBytes = 0; if (memsetParams_.height == 1) { sizeBytes = memsetParams_.width * memsetParams_.elementSize; } else { - sizeBytes = memsetParams_.width * memsetParams_.height * memsetParams_.elementSize; + sizeBytes = memsetParams_.width * 
memsetParams_.height * depth_ * memsetParams_.elementSize; } } @@ -1721,6 +1732,7 @@ class GraphMemsetNode : public GraphNode { // Copy constructor GraphMemsetNode(const GraphMemsetNode& memsetNode) : GraphNode(memsetNode) { memsetParams_ = memsetNode.memsetParams_; + depth_ = memsetNode.depth_; } GraphNode* clone() const override { @@ -1730,20 +1742,20 @@ class GraphMemsetNode : public GraphNode { virtual std::string GetLabel(hipGraphDebugDotFlags flag) override { std::string label; if (flag == hipGraphDebugDotFlagsMemsetNodeParams || flag == hipGraphDebugDotFlagsVerbose) { - char buffer[500]; + char buffer[4096]; sprintf(buffer, "{\n%s\n| {{ID | node handle | dptr | pitch | value | elementSize | width | " - "height} | {%u | %p | %p | %zu | %u | %u | %zu | %zu}}}", + "height | depth} | {%u | %p | %p | %zu | %u | %u | %zu | %zu | %zu}}}", label_.c_str(), GetID(), this, memsetParams_.dst, memsetParams_.pitch, memsetParams_.value, memsetParams_.elementSize, memsetParams_.width, - memsetParams_.height); + memsetParams_.height, depth_); label = buffer; } else { size_t sizeBytes; if (memsetParams_.height == 1) { sizeBytes = memsetParams_.width * memsetParams_.elementSize; } else { - sizeBytes = memsetParams_.width * memsetParams_.height * memsetParams_.elementSize; + sizeBytes = memsetParams_.width * memsetParams_.height * depth_ * memsetParams_.elementSize; } label = std::to_string(GetID()) + "\n" + label_ + "\n(" + std::to_string(memsetParams_.value) + "," + std::to_string(sizeBytes) + ")"; @@ -1774,7 +1786,7 @@ class GraphMemsetNode : public GraphNode { {memsetParams_.dst, memsetParams_.pitch, memsetParams_.width * memsetParams_.elementSize, memsetParams_.height}, memsetParams_.value, - {memsetParams_.width * memsetParams_.elementSize, memsetParams_.height, 1}, stream, + {memsetParams_.width * memsetParams_.elementSize, memsetParams_.height, depth_}, stream, memsetParams_.elementSize); } return status; @@ -1793,12 +1805,15 @@ class GraphMemsetNode : public GraphNode { params->width = memsetParams_.width; } - hipError_t SetParamsInternal(const hipMemsetParams* params, bool isExec) { + hipError_t SetParamsInternal(const hipMemsetParams* params, bool isExec, size_t depth = 1) { hipError_t hip_error = hipSuccess; hip_error = ihipGraphMemsetParams_validate(params); if (hip_error != hipSuccess) { return hip_error; } + if (depth == 0) { + return hipErrorInvalidValue; + } if (isExec) { size_t discardOffset = 0; amd::Memory *memObj = getMemoryObject(params->dst, discardOffset); @@ -1829,7 +1844,7 @@ class GraphMemsetNode : public GraphNode { // 2D - hipGraphExecMemsetNodeSetParams returns invalid value if new width or new height is // not same as what memset node is added with. 
if (memsetParams_.width * memsetParams_.elementSize != params->width * params->elementSize - || memsetParams_.height != params->height) { + || memsetParams_.height != params->height || depth != depth_) { return hipErrorInvalidValue; } } else { @@ -1839,26 +1854,30 @@ class GraphMemsetNode : public GraphNode { amd::Memory *memObj = getMemoryObject(params->dst, discardOffset); if (memObj != nullptr) { if (params->width * params->elementSize > memObj->getUserData().width_ - || params->height > memObj->getUserData().height_) { + || params->height > memObj->getUserData().height_ + || depth > memObj->getUserData().depth_) { return hipErrorInvalidValue; } } } - sizeBytes = params->width * params->elementSize * params->height * 1; + sizeBytes = params->width * params->elementSize * params->height * depth; hip_error = ihipMemset3D_validate( {params->dst, params->pitch, params->width * params->elementSize, params->height}, - params->value, {params->width * params->elementSize, params->height, 1}, sizeBytes); + params->value, {params->width * params->elementSize, params->height, depth}, sizeBytes); } if (hip_error != hipSuccess) { return hip_error; } std::memcpy(&memsetParams_, params, sizeof(hipMemsetParams)); + depth_ = depth; return hipSuccess; } - hipError_t SetParams(const hipMemsetParams* params, bool isExec = false) { - return SetParamsInternal(params, isExec); + + hipError_t SetParams(const hipMemsetParams* params, bool isExec = false, size_t depth = 1) { + return SetParamsInternal(params, isExec, depth); } - hipError_t SetParams(const HIP_MEMSET_NODE_PARAMS* params, bool isExec = false) { + + hipError_t SetParams(const HIP_MEMSET_NODE_PARAMS* params, bool isExec = false, size_t depth = 1) { hipMemsetParams pmemsetParams; pmemsetParams.dst = params->dst; pmemsetParams.elementSize = params->elementSize; @@ -1866,11 +1885,11 @@ class GraphMemsetNode : public GraphNode { pmemsetParams.pitch = params->pitch; pmemsetParams.value = params->value; pmemsetParams.width = params->width; - return SetParamsInternal(&pmemsetParams, isExec); + return SetParamsInternal(&pmemsetParams, isExec, depth); } hipError_t SetParams(GraphNode* node) override { const GraphMemsetNode* memsetNode = static_cast(node); - return SetParams(&memsetNode->memsetParams_); + return SetParams(&memsetNode->memsetParams_, false, memsetNode->depth_); } }; @@ -2106,9 +2125,10 @@ class GraphMemAllocNode final : public GraphNode { size_ = aligned_size; // Execute the original mapping command VirtualMapCommand::submit(device); + queue()->device().SetMemAccess(va_->getSvmPtr(), aligned_size, amd::Device::VmmAccess::kReadWrite); va_->retain(); - ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute: %p, %p", - va_->getSvmPtr(), memory()); + ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p", + va_->getSvmPtr(), reinterpret_cast(va_->getSvmPtr()) + aligned_size, memory()); } private: diff --git a/hipamd/src/hip_hcc.map.in b/hipamd/src/hip_hcc.map.in index 9bfa41d49..2ec315cba 100644 --- a/hipamd/src/hip_hcc.map.in +++ b/hipamd/src/hip_hcc.map.in @@ -560,6 +560,9 @@ local: hip_6.2 { global: hipGetFuncBySymbol; + hipDrvGraphExecMemcpyNodeSetParams; + hipDrvGraphExecMemsetNodeSetParams; + hipDrvGraphAddMemFreeNode; local: *; } hip_6.1; diff --git a/hipamd/src/hip_hmm.cpp b/hipamd/src/hip_hmm.cpp index 908e030d8..c6e5733b2 100644 --- a/hipamd/src/hip_hmm.cpp +++ b/hipamd/src/hip_hmm.cpp @@ -89,7 +89,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, size_t offset = 0; 
amd::Memory* memObj = getMemoryObject(dev_ptr, offset); - if (memObj == nullptr || (memObj && count > (memObj->getSize() - offset))) { + if ((memObj != nullptr) && (count > (memObj->getSize() - offset))) { HIP_RETURN(hipErrorInvalidValue); } if (device != hipCpuDeviceId && (static_cast(device) >= g_devices.size())) { diff --git a/hipamd/src/hip_internal.hpp b/hipamd/src/hip_internal.hpp index 46a03e0b4..1c3f23044 100644 --- a/hipamd/src/hip_internal.hpp +++ b/hipamd/src/hip_internal.hpp @@ -180,23 +180,48 @@ const char* ihipGetErrorName(hipError_t hip_error); } \ } while (0); +// During stream capture some actions, such as a call to hipMalloc, may be unsafe and prohibited +// during capture. It is allowed only in relaxed mode. #define CHECK_STREAM_CAPTURE_SUPPORTED() \ if (hip::tls.stream_capture_mode_ == hipStreamCaptureModeThreadLocal) { \ if (hip::tls.capture_streams_.size() != 0) { \ + for (auto stream : hip::tls.capture_streams_) { \ + stream->SetCaptureStatus(hipStreamCaptureStatusInvalidated); \ + } \ HIP_RETURN(hipErrorStreamCaptureUnsupported); \ } \ } else if (hip::tls.stream_capture_mode_ == hipStreamCaptureModeGlobal) { \ if (hip::tls.capture_streams_.size() != 0) { \ + for (auto stream : hip::tls.capture_streams_) { \ + stream->SetCaptureStatus(hipStreamCaptureStatusInvalidated); \ + } \ HIP_RETURN(hipErrorStreamCaptureUnsupported); \ } \ if (g_captureStreams.size() != 0) { \ + for (auto stream : g_captureStreams) { \ + stream->SetCaptureStatus(hipStreamCaptureStatusInvalidated); \ + } \ HIP_RETURN(hipErrorStreamCaptureUnsupported); \ } \ } -// Sync APIs cannot be called when stream capture is active +// Device sync is not supported during capture +#define CHECK_SUPPORTED_DURING_CAPTURE() \ + if (!g_allCapturingStreams.empty()) { \ + for (auto stream : g_allCapturingStreams) { \ + stream->SetCaptureStatus(hipStreamCaptureStatusInvalidated); \ + } \ + return hipErrorStreamCaptureUnsupported; \ + } + +// Sync APIs like hipMemset, hipMemcpy etc.. 
cannot be called when stream capture is active +// for all capture modes hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal and +// hipStreamCaptureModeRelaxed #define CHECK_STREAM_CAPTURING() \ - if (!g_captureStreams.empty()) { \ + if (!g_allCapturingStreams.empty()) { \ + for (auto stream : g_allCapturingStreams) { \ + stream->SetCaptureStatus(hipStreamCaptureStatusInvalidated); \ + } \ return hipErrorStreamCaptureImplicit; \ } @@ -207,6 +232,10 @@ const char* ihipGetErrorName(hipError_t hip_error); hipStreamCaptureStatusActive) { \ hipError_t status = hip::capture##name(stream, ##__VA_ARGS__); \ return status; \ + } else if (stream != nullptr && \ + reinterpret_cast(stream)->GetCaptureStatus() == \ + hipStreamCaptureStatusInvalidated) { \ + return hipErrorStreamCaptureInvalidated; \ } #define PER_THREAD_DEFAULT_STREAM(stream) \ @@ -306,17 +335,10 @@ class stream_per_thread { /// Returns the CU mask for the current stream const std::vector GetCUMask() const { return cuMask_; } - /// Sync all streams - static void SyncAllStreams(int deviceId, bool cpu_wait = true); - /// Check whether any blocking stream running static bool StreamCaptureBlocking(); - /// Destroy all streams on a given device - static void destroyAllStreams(int deviceId); - static void Destroy(hip::Stream* stream); - /// Check Stream Capture status to make sure it is done static bool StreamCaptureOngoing(hipStream_t hStream); @@ -416,7 +438,6 @@ class stream_per_thread { parallelCaptureStreams_.erase(it); } } - static bool existsActiveStreamForDevice(hip::Device* device); /// The stream should be destroyed via release() rather than delete private: @@ -424,8 +445,10 @@ class stream_per_thread { }; /// HIP Device class - class Device { + class Device : public amd::ReferenceCountedObject { amd::Monitor lock_{"Device lock", true}; + amd::Monitor streamSetLock{"Guards device stream set"}; + std::unordered_set streamSet; /// ROCclr context amd::Context* context_; /// Device's ID @@ -499,7 +522,7 @@ class stream_per_thread { amd::ScopedLock lock(lock_); /// Either stream is active or device is active if (isActive_) return true; - if (Stream::existsActiveStreamForDevice(this)) { + if (existsActiveStreamForDevice()) { isActive_ = true; return true; } @@ -540,6 +563,22 @@ class stream_per_thread { /// Returns true if memory pool is valid on this device bool IsMemoryPoolValid(MemoryPool* pool); + void AddStream(Stream* stream); + + void RemoveStream(Stream* stream); + + bool StreamExists(Stream* stream); + + void destroyAllStreams(); + + void SyncAllStreams( bool cpu_wait = true); + + bool StreamCaptureBlocking(); + + bool existsActiveStreamForDevice(); + /// Wait all active streams on the blocking queue. The method enqueues a wait command and + /// doesn't stall the current thread + void WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false); }; /// Thread Local Storage Variables Aggregator Class @@ -589,10 +628,6 @@ class stream_per_thread { extern void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* user_data); - /// Wait all active streams on the blocking queue. 
The method enqueues a wait command and - /// doesn't stall the current thread - extern void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false); - extern std::vector g_devices; extern hipError_t ihipDeviceGetCount(int* count); extern int ihipGetDevice(); diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 613908501..bfe00f6ec 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -76,7 +76,7 @@ hipError_t ihipFree(void *ptr) { if (memory_object != nullptr) { // Wait on the device, associated with the current memory object during allocation auto device_id = memory_object->getUserData().deviceId; - hip::Stream::SyncAllStreams(device_id); + g_devices[device_id]->SyncAllStreams(); // Find out if memory belongs to any memory pool if (!g_devices[device_id]->FreeMemory(memory_object, nullptr)) { @@ -367,11 +367,25 @@ hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, amd::Memory* srcMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst, dOffset); + + // If the mem object is a VMM sub buffer (subbuffer has parent set), + // then use parent's size for validation. + if (srcMemory && srcMemory->parent() && (srcMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + srcMemory = srcMemory->parent(); + } + + // If the mem object is a VMM sub buffer (subbuffer has parent set), + // then use parent's size for validation. + if (dstMemory && dstMemory->parent() && (dstMemory->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + dstMemory = dstMemory->parent(); + } + // Return error if sizeBytes passed to memcpy is more than the actual size allocated if ((dstMemory && sizeBytes > (dstMemory->getSize() - dOffset)) || (srcMemory && sizeBytes > (srcMemory->getSize() - sOffset))) { return hipErrorInvalidValue; } + //If src and dst ptr are null then kind must be either h2h or def. if (!IsHtoHMemcpyValid(dst, src, kind)) { return hipErrorInvalidValue; @@ -391,8 +405,8 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::NONE); if ((srcMemory == nullptr) && (dstMemory != nullptr)) { hip::Stream* pStream = &stream; - if (queueDevice != dstMemory->getContext().devices()[0]) { - pStream = hip::getNullStream(dstMemory->getContext()); + if (queueDevice != dstMemory->GetDeviceById()) { + pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); @@ -402,8 +416,8 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, *dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0, copyMetadata); } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { hip::Stream* pStream = &stream; - if (queueDevice != srcMemory->getContext().devices()[0]) { - pStream = hip::getNullStream(srcMemory->getContext()); + if (queueDevice != srcMemory->GetDeviceById()) { + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); @@ -415,7 +429,7 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, // Check if the queue device doesn't match the device on any memory object. // And any of them are not host allocation. 
// Hence it's a P2P transfer, because the app has requested access to another GPU - if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && + if ((srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) && ((srcMemory->getContext().devices().size() == 1) && (dstMemory->getContext().devices().size() == 1))) { command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList, @@ -431,26 +445,26 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, } } else { hip::Stream* pStream = &stream; - if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) && + if ((srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) && (queueDevice != srcMemory->getContext().devices()[0])) { - pStream = hip::getNullStream(srcMemory->getContext()); + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } - } else if (srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) { + } else if (srcMemory->GetDeviceById() != dstMemory->GetDeviceById()) { // Scenarios such as DtoH where dst is pinned memory if ((queueDevice != srcMemory->getContext().devices()[0]) && (dstMemory->getContext().devices().size() != 1)) { - pStream = hip::getNullStream(srcMemory->getContext()); + pStream = hip::getNullStream(srcMemory->GetDeviceById()->context()); amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } // Scenarios such as HtoD where src is pinned memory - } else if ((queueDevice != dstMemory->getContext().devices()[0]) && + } else if ((queueDevice != dstMemory->GetDeviceById()) && (srcMemory->getContext().devices().size() != 1)) { - pStream = hip::getNullStream(dstMemory->getContext()); + pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); @@ -516,7 +530,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) || ((srcMemory != nullptr) && (dstMemory == nullptr))) { isHostAsync = false; - } else if (srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) { + } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) { hipMemoryType srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & srcMemory->getMemFlags())? 
hipMemoryTypeHost : hipMemoryTypeDevice; hipMemoryType dstMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & @@ -536,7 +550,7 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin if (!isHostAsync) { command->awaitCompletion(); } else if (!isGPUAsync) { - hip::Stream* pStream = hip::getNullStream(dstMemory->getContext()); + hip::Stream* pStream = hip::getNullStream(dstMemory->GetDeviceById()->context()); amd::Command::EventWaitList waitList; waitList.push_back(command); amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList); @@ -575,6 +589,8 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag ihipFlags = CL_MEM_SVM_ATOMICS; } else if (flags == hipDeviceMallocUncached) { ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED; + } else if (flags == hipDeviceMallocContiguous) { + ihipFlags = ROCCLR_MEM_HSA_CONTIGUOUS | ROCCLR_MEM_HSA_UNCACHED; } else if (flags == hipMallocSignalMemory) { ihipFlags = CL_MEM_SVM_ATOMICS | CL_MEM_SVM_FINE_GRAIN_BUFFER | ROCCLR_MEM_HSA_SIGNAL_MEMORY; if (sizeBytes != 8) { @@ -743,7 +759,7 @@ hipError_t ihipArrayDestroy(hipArray_t array) { auto image = as_amd(memObj); // Wait on the device, associated with the current memory object during allocation - hip::Stream::SyncAllStreams(image->getUserData().deviceId); + g_devices[image->getUserData().deviceId]->SyncAllStreams(); image->release(); delete array; @@ -1252,7 +1268,7 @@ hipError_t ihipHostUnregister(void* hostPtr) { if (mem != nullptr) { // Wait on the device, associated with the current memory object during allocation - hip::Stream::SyncAllStreams(mem->getUserData().deviceId); + g_devices[mem->getUserData().deviceId]->SyncAllStreams(); amd::MemObjMap::RemoveMemObj(hostPtr); for (const auto& device: g_devices) { @@ -2263,7 +2279,7 @@ void ihipCopyMemParamSet(const HIP_MEMCPY3D* pCopy, hipMemoryType& srcMemType, hipMemoryTypeHost; if (dstMemoryType == hipMemoryTypeDevice) { - const_cast(pCopy)->dstDevice = const_cast(pCopy->dstDevice); + const_cast(pCopy)->dstDevice = const_cast(pCopy->dstHost); } } srcMemType = srcMemoryType; @@ -2882,53 +2898,6 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p) { return hipSuccess; } -hipError_t ihipDrvMemcpy3DParamValidate(const HIP_MEMCPY3D* p) { - // Passing more than one non-zero source or destination will cause hipMemcpy3D() to - // return an error. - if (p == nullptr || ((p->srcArray != nullptr) && (p->srcHost != nullptr)) || - ((p->dstArray != nullptr) && (p->dstHost != nullptr))) { - return hipErrorInvalidValue; - } - // The struct passed to hipMemcpy3D() must specify one of srcArray or srcPtr and one of dstArray - // or dstPtr. - if (((p->srcArray == nullptr) && (p->srcHost == nullptr)) || - ((p->dstArray == nullptr) && (p->dstHost == nullptr))) { - return hipErrorInvalidValue; - } - - // If the source and destination are both arrays, hipMemcpy3D() will return an error if they do - // not have the same element size. 
- if (((p->srcArray != nullptr) && (p->dstArray != nullptr)) && - (hip::getElementSize(p->srcArray) != hip::getElementSize(p->dstArray))) { - return hipErrorInvalidValue; - } - - // dst/src pitch must be less than max pitch - auto* deviceHandle = g_devices[hip::getCurrentDevice()->deviceId()]->devices()[0]; - const auto& info = deviceHandle->info(); - constexpr auto int32_max = static_cast(std::numeric_limits::max()); - auto maxPitch = std::min(info.maxMemAllocSize_, int32_max); - - // negative pitch cases - if (p->srcPitch >= maxPitch || p->dstPitch >= maxPitch) { - return hipErrorInvalidValue; - } - - if (p->dstArray == nullptr && p->srcArray == nullptr) { - if ((p->WidthInBytes + p->srcXInBytes > p->srcPitch) || - (p->WidthInBytes + p->dstXInBytes > p->dstPitch)) { - return hipErrorInvalidValue; - } - } - if (p->srcMemoryType < hipMemoryTypeHost || p->srcMemoryType > hipMemoryTypeManaged) { - return hipErrorInvalidMemcpyDirection; - } - if (p->dstMemoryType < hipMemoryTypeHost || p->dstMemoryType > hipMemoryTypeManaged) { - return hipErrorInvalidMemcpyDirection; - } - return hipSuccess; -} - hipError_t ihipDrvMemcpy3D_validate(const HIP_MEMCPY3D* pCopy) { hipError_t status; if (pCopy->WidthInBytes == 0 || pCopy->Height == 0 || pCopy->Depth == 0) { @@ -3786,8 +3755,12 @@ hipError_t ihipPointerGetAttributes(void* data, hipPointer_attribute attribute, break; } case HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS : { - // This attribute is ideally used in hipPointerSetAttribute, defaults to true - *reinterpret_cast(data) = true; + if (memObj) { + *reinterpret_cast(data) = memObj->getUserData().sync_mem_ops_; + } else { + *reinterpret_cast(data) = false; + return hipErrorInvalidValue; + } break; } case HIP_POINTER_ATTRIBUTE_BUFFER_ID : { @@ -4304,7 +4277,7 @@ hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { auto image = as_amd(mem_obj); // Wait on the device, associated with the current memory object during allocation - hip::Stream::SyncAllStreams(image->getUserData().deviceId); + g_devices[image->getUserData().deviceId]->SyncAllStreams(); image->release(); delete mipmapped_array_ptr; diff --git a/hipamd/src/hip_mempool_impl.cpp b/hipamd/src/hip_mempool_impl.cpp index 8b30e8985..dc091c71e 100644 --- a/hipamd/src/hip_mempool_impl.cpp +++ b/hipamd/src/hip_mempool_impl.cpp @@ -41,7 +41,8 @@ void Heap::AddMemory(amd::Memory* memory, const MemoryTimestamp& ts) { } // ================================================================================================ -amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic, void* dptr) { +amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic, + void* dptr, MemoryTimestamp* ts) { amd::Memory* memory = nullptr; auto start = allocations_.lower_bound({size, nullptr}); for (auto it = start; it != allocations_.end();) { @@ -61,6 +62,8 @@ amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic, v if (check_address && (it->second.IsSafeFind(stream, opp_mode))) { memory = it->first.second; total_size_ -= memory->getSize(); + // Preserve event, since the logic could skip GPU wait on reuse + ts->event_ = it->second.event_; // Remove found allocation from the map it = allocations_.erase(it); break; @@ -79,8 +82,6 @@ bool Heap::RemoveMemory(amd::Memory* memory, MemoryTimestamp* ts) { // Preserve timestamp info for possible reuse later *ts = it->second; } else { - // Runtime will delete the timestamp object, hence make sure HIP event is released - it->second.Wait(); 
it->second.SetEvent(nullptr); } total_size_ -= mem_size; @@ -169,7 +170,8 @@ void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) { amd::ScopedLock lock(lock_pool_ops_); void* dev_ptr = nullptr; - amd::Memory* memory = free_heap_.FindMemory(size, stream, Opportunistic(), dptr); + MemoryTimestamp ts; + amd::Memory* memory = free_heap_.FindMemory(size, stream, Opportunistic(), dptr, &ts); if (memory == nullptr) { if (Properties().maxSize != 0 && (max_total_size_ + size) > Properties().maxSize) { return nullptr; @@ -180,6 +182,7 @@ void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) { return nullptr; } cl_svm_mem_flags flags = (state_.interprocess_) ? ROCCLR_MEM_INTERPROCESS : 0; + flags |= (state_.phys_mem_) ? ROCCLR_MEM_PHYMEM : 0; dev_ptr = amd::SvmBuffer::malloc(*context, flags, size, dev_info.memBaseAddrAlign_, nullptr); if (dev_ptr == nullptr) { size_t free = 0, total =0; @@ -206,12 +209,11 @@ void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) { } } } else { - free_heap_.RemoveMemory(memory); - const device::Memory* dev_mem = memory->getDeviceMemory(*device_->devices()[0]); - dev_ptr = reinterpret_cast(dev_mem->virtualAddress()); + dev_ptr = memory->getSvmPtr(); } // Place the allocated memory into the busy heap - busy_heap_.AddMemory(memory, stream); + ts.AddSafeStream(stream); + busy_heap_.AddMemory(memory, ts); max_total_size_ = std::max(max_total_size_, busy_heap_.GetTotalSize() + free_heap_.GetTotalSize()); @@ -256,14 +258,17 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) { } ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Pool FreeMem: %p, %p", memory->getSvmPtr(), memory); - if (stream == nullptr) { + if (memory->getUserData().vaddr_mem_obj != nullptr) { + auto va_mem = memory->getUserData().vaddr_mem_obj; + if (stream == nullptr) { stream = g_devices[memory->getUserData().deviceId]->NullStream(); + } + // Unmap virtual address from memory + auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{}, + va_mem->getSvmPtr(), va_mem->getSize(), nullptr); + cmd->enqueue(); + cmd->release(); } - // Unmap virtual address from memory - auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{}, - memory->getSvmPtr(), memory->getSize(), nullptr); - cmd->enqueue(); - cmd->release(); if (stream != nullptr) { // The stream of destruction is a safe stream, because the app must handle sync @@ -291,7 +296,7 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) { // Decrement the reference counter on the pool. // Note: It may delete memory pool for the last allocation. Thus, the scope lock can't include - // this call. + // this call. 
release(); return true; diff --git a/hipamd/src/hip_mempool_impl.hpp b/hipamd/src/hip_mempool_impl.hpp index 536e24f02..e397ceb79 100644 --- a/hipamd/src/hip_mempool_impl.hpp +++ b/hipamd/src/hip_mempool_impl.hpp @@ -38,13 +38,11 @@ struct SharedMemPointer { }; struct MemoryTimestamp { - MemoryTimestamp(hip::Stream* stream): event_(nullptr) { + MemoryTimestamp(hip::Stream* stream = nullptr) { if (stream != nullptr) { safe_streams_.insert(stream); } } - MemoryTimestamp(): event_(nullptr) {} - /// Adds a safe stream to the list of stream for possible reuse void AddSafeStream(Stream* event_stream, Stream* wait_stream = nullptr) { if (wait_stream == nullptr) { @@ -59,6 +57,8 @@ struct MemoryTimestamp { } /// Changes last known valid event asociated with memory void SetEvent(hip::Event* event) { + // Runtime will delete the HIP event, hence make sure GPU is done with it + Wait(); delete event_; event_ = event; } @@ -94,7 +94,7 @@ struct MemoryTimestamp { } std::unordered_set safe_streams_; //!< Safe streams for memory reuse - hip::Event* event_; //!< Last known HIP event, associated with the memory object + hip::Event* event_ = nullptr; //!< Last known HIP event, associated with the memory object }; class Heap : public amd::EmbeddedObject { @@ -112,7 +112,8 @@ class Heap : public amd::EmbeddedObject { void AddMemory(amd::Memory* memory, const MemoryTimestamp& ts); /// Finds memory object with the specified size - amd::Memory* FindMemory(size_t size, Stream* stream, bool opportunistic, void* dptr = nullptr); + amd::Memory* FindMemory(size_t size, Stream* stream, bool opportunistic, + void* dptr, MemoryTimestamp* ts); /// Removes allocation from the map bool RemoveMemory(amd::Memory* memory, MemoryTimestamp* ts = nullptr); @@ -157,7 +158,6 @@ class Heap : public amd::EmbeddedObject { } } - /// Checks if memory belongs to this heap bool IsActiveMemory(amd::Memory* memory) const { return (allocations_.find({memory->getSize(), memory}) != allocations_.end()); @@ -196,7 +196,7 @@ class MemoryPool : public amd::ReferenceCountedObject { SharedAccess access_[kMaxMgpuAccess]; //!< The list of devices for access }; - MemoryPool(hip::Device* device, const hipMemPoolProps* props = nullptr) + MemoryPool(hip::Device* device, const hipMemPoolProps* props = nullptr, bool phys_mem = false) : busy_heap_(device), free_heap_(device), lock_pool_ops_("Pool operations", true), @@ -208,6 +208,7 @@ class MemoryPool : public amd::ReferenceCountedObject { state_.event_dependencies_ = 1; state_.opportunistic_ = 1; state_.internal_dependencies_ = 1; + state_.phys_mem_ = HIP_MEM_POOL_USE_VM && phys_mem; if (props != nullptr) { properties_ = *props; } else { @@ -317,6 +318,7 @@ class MemoryPool : public amd::ReferenceCountedObject { //!< dependencies uint32_t interprocess_ : 1; //!< Memory pool can be used in interprocess communications uint32_t graph_in_use_ : 1; //!< Memory pool was used in a graph execution + uint32_t phys_mem_ : 1; //!< Mempool is used for graphs and will have physical allocations }; uint32_t value_; } state_; diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index dfb082dca..5d458e3f4 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -66,10 +66,6 @@ hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned HIP_RETURN(PlatformState::instance().loadModule(module, 0, image)); } -extern hipError_t __hipExtractCodeObjectFromFatBinary( - const void* data, const std::vector& devices, - std::vector>& code_objs); - hipError_t 
hipModuleGetFunction(hipFunction_t* hfunc, hipModule_t hmod, const char* name) { HIP_INIT_API(hipModuleGetFunction, hfunc, hmod, name); @@ -92,10 +88,6 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t h const char* name) { HIP_INIT_API(hipModuleGetGlobal, dptr, bytes, hmod, name); - if (dptr == nullptr || bytes == nullptr) { - // If either is nullptr, ignore it - HIP_RETURN(hipSuccess); - } if ((dptr == nullptr && bytes == nullptr) || name == nullptr || strlen(name) == 0) { HIP_RETURN(hipErrorInvalidValue); } @@ -709,7 +701,11 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b void** kernelParams, uint32_t sharedMemBytes, hipStream_t hStream) { if (!hip::isValid(hStream)) { - return hipErrorInvalidValue; + return hipErrorContextIsDestroyed; + } + + if (f == nullptr) { + return hipErrorInvalidDeviceFunction; } hipFunction_t func = nullptr; @@ -729,6 +725,10 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b return hipErrorInvalidConfiguration; } + if (sharedMemBytes > device->info().localMemSizePerCU_) { + return hipErrorCooperativeLaunchTooLarge; + } + return ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), static_cast(globalWorkSizeY), static_cast(globalWorkSizeZ), blockDim.x, blockDim.y, diff --git a/hipamd/src/hip_platform.cpp b/hipamd/src/hip_platform.cpp index e88458dc4..3c02198c9 100644 --- a/hipamd/src/hip_platform.cpp +++ b/hipamd/src/hip_platform.cpp @@ -175,7 +175,13 @@ void __hipRegisterTexture( } void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules) { - hipError_t err = PlatformState::instance().removeFatBinary(modules); + // By calling hipDeviceSynchronize ensure that all HSA signal handlers + // complete before removeFatBinary + hipError_t err = hipDeviceSynchronize(); + if (err != hipSuccess) { + LogPrintfError("Error during hipDeviceSynchronize, error: %d", err); + } + err = PlatformState::instance().removeFatBinary(modules); guarantee((err == hipSuccess), "Cannot Unregister Fat Binary, error:%d", err); } @@ -414,8 +420,14 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( // If the best block size is smaller than the block size used to fit the maximum, // then we need to make the grid bigger for full occupancy. const int bestBlocksPerCU = alu_limited_threads / (*bestBlockSize); + uint32_t maxCUs = device.info().maxComputeUnits_; + if (wrkGrpInfo->isWGPMode_ == false && device.settings().enableWgpMode_ == true) { + maxCUs *= 2; + } else if ((wrkGrpInfo->isWGPMode_ == true && device.settings().enableWgpMode_ == false)) { + maxCUs /= 2; + } // Unless those blocks are further constrained by LDS size. 
- *numBlocksPerGrid = device.info().maxComputeUnits_ * std::min(bestBlocksPerCU, lds_occupancy_wgs); + *numBlocksPerGrid = (maxCUs * std::min(bestBlocksPerCU, lds_occupancy_wgs)); return hipSuccess; } @@ -699,11 +711,8 @@ void PlatformState::init() { initialized_ = true; for (auto& it : statCO_.modules_) { hipError_t err = digestFatBinary(it.first, it.second); - if (err == hipErrorNoBinaryForGpu) { + if (err != hipSuccess) { HIP_ERROR_PRINT(err, "continue parsing remaining modules"); - } else if (err != hipSuccess) { - HIP_ERROR_PRINT(err); - return; } } for (auto& it : statCO_.vars_) { @@ -782,7 +791,7 @@ hipError_t PlatformState::getDynGlobalVar(const char* hostVar, hipModule_t hmod, hipDeviceptr_t* dev_ptr, size_t* size_ptr) { amd::ScopedLock lock(lock_); - if (hostVar == nullptr || dev_ptr == nullptr || size_ptr == nullptr) { + if (hostVar == nullptr) { return hipErrorInvalidValue; } @@ -791,14 +800,20 @@ hipError_t PlatformState::getDynGlobalVar(const char* hostVar, hipModule_t hmod, LogPrintfError("Cannot find the module: 0x%x", hmod); return hipErrorNotFound; } - *dev_ptr = nullptr; + if (dev_ptr) { + *dev_ptr = nullptr; + } IHIP_RETURN_ONFAIL(it->second->getManagedVarPointer(hostVar, dev_ptr, size_ptr)); // if dev_ptr is nullptr, hostvar is not in managed variable list - if (*dev_ptr == nullptr) { + if ((dev_ptr && *dev_ptr == nullptr) || (size_ptr && *size_ptr == 0)) { hip::DeviceVar* dvar = nullptr; IHIP_RETURN_ONFAIL(it->second->getDeviceVar(&dvar, hostVar)); - *dev_ptr = dvar->device_ptr(); - *size_ptr = dvar->size(); + if (dev_ptr != nullptr) { + *dev_ptr = dvar->device_ptr(); + } + if (size_ptr != nullptr) { + *size_ptr = dvar->size(); + } } return hipSuccess; } diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp index 9d0475bc9..2e80e8e90 100644 --- a/hipamd/src/hip_stream.cpp +++ b/hipamd/src/hip_stream.cpp @@ -25,8 +25,6 @@ #include "hip_prof_api.h" namespace hip { -static amd::Monitor streamSetLock{"Guards global stream set"}; -static std::unordered_set streamSet; // ================================================================================================ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, @@ -43,8 +41,7 @@ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, originStream_(false), captureID_(0) { - amd::ScopedLock lock(streamSetLock); - streamSet.insert(this); + device_->AddStream(this); } // ================================================================================================ @@ -76,10 +73,7 @@ bool Stream::Create() { // ================================================================================================ void Stream::Destroy(hip::Stream* stream) { - { - amd::ScopedLock lock(streamSetLock); - streamSet.erase(stream); - } + stream->device_->RemoveStream(stream); stream->release(); } @@ -95,11 +89,12 @@ bool isValid(hipStream_t& stream) { } hip::Stream* s = reinterpret_cast(stream); - amd::ScopedLock lock(streamSetLock); - if (streamSet.find(s) == streamSet.end()) { - return false; + for (auto& device : g_devices) { + if (device->StreamExists(s)) { + return true; + } } - return true; + return false; } // ================================================================================================ @@ -122,144 +117,47 @@ int Stream::DeviceId(const hipStream_t hStream) { } // ================================================================================================ -void Stream::SyncAllStreams(int deviceId, bool cpu_wait) { - // Make a local copy to avoid 
stalls for GPU finish with multiple threads - std::vector streams; - streams.reserve(streamSet.size()); - { - amd::ScopedLock lock(streamSetLock); - for (auto it : streamSet) { - if (it->DeviceId() == deviceId) { - streams.push_back(it); - it->retain(); - } - } - } - for (auto it : streams) { - it->finish(cpu_wait); - it->release(); - } - // Release freed memory for all memory pools on the device - g_devices[deviceId]->ReleaseFreedMemory(); -} // ================================================================================================ bool Stream::StreamCaptureBlocking() { - amd::ScopedLock lock(streamSetLock); - for (auto& it : streamSet) { - if (it->GetCaptureStatus() == hipStreamCaptureStatusActive && it->Flags() != hipStreamNonBlocking) { + for (auto& device : g_devices) { + if (device->StreamCaptureBlocking()) { return true; } } return false; } -void Stream::destroyAllStreams(int deviceId) { - std::vector toBeDeleted; - { - amd::ScopedLock lock(streamSetLock); - for (auto& it : streamSet) { - if (it->Null() == false && it->DeviceId() == deviceId) { - toBeDeleted.push_back(it); - } - } - } - for (auto& it : toBeDeleted) { - hip::Stream::Destroy(it); - } -} - bool Stream::StreamCaptureOngoing(hipStream_t hStream) { hip::Stream* s = reinterpret_cast(hStream); - // Allow capture to be less restrictive one one changes the stream capture interaction - // mode for the thread - if (hip::tls.stream_capture_mode_ == hipStreamCaptureModeRelaxed) { - return false; - } - // If any local thread has an ongoing or concurrent capture sequence initiated - // with hipStreamCaptureModeGlobal, it is prohibited from unsafe calls - if (s != nullptr && s->GetCaptureMode() == hipStreamCaptureModeGlobal) { - amd::ScopedLock lock(g_captureStreamsLock); - return (g_captureStreams.empty() == true && hip::tls.capture_streams_.empty()) ? false : true; - } - else { - amd::ScopedLock lock(g_streamSetLock); - return (g_allCapturingStreams.find(s) == g_allCapturingStreams.end() ? false : true); - } -} - -bool Stream::existsActiveStreamForDevice(hip::Device* device) { - - amd::ScopedLock lock(streamSetLock); - - for (const auto& active_stream : streamSet) { - if ((active_stream->GetDevice() == device) && - active_stream->GetQueueStatus()) { - return true; + if (s != nullptr && s->GetCaptureStatus() == hipStreamCaptureStatusNone) { + // If current thread is capturing in relaxed mode + if (hip::tls.stream_capture_mode_ == hipStreamCaptureModeRelaxed) { + return false; } - } - return false; -} - -// ================================================================================================ -void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) { - amd::Command::EventWaitList eventWaitList(0); - bool submitMarker = 0; - - auto waitForStream = [&submitMarker, - &eventWaitList](hip::Stream* stream) { - if (amd::Command *command = stream->getLastQueuedCommand(true)) { - amd::Event &event = command->event(); - // Check HW status of the ROCcrl event. 
- // Note: not all ROCclr modes support HW status - bool ready = stream->device().IsHwEventReady(event); - if (!ready) { - ready = (command->status() == CL_COMPLETE); - } - submitMarker |= stream->vdev()->isFenceDirty(); - // Check the current active status - if (!ready) { - command->notifyCmdQueue(); - eventWaitList.push_back(command); - } else { - command->release(); + // If any stream in current/concurrent thread is capturing in global mode + amd::ScopedLock lock(g_captureStreamsLock); + if (!g_captureStreams.empty()) { + for (auto stream : hip::g_captureStreams) { + stream->SetCaptureStatus(hipStreamCaptureStatusInvalidated); } + return true; } - }; - - if (wait_null_stream) { - if (hip::Stream* null_stream = blocking_stream->GetDevice()->GetNullStream()) { - waitForStream(null_stream); - } - } else { - amd::ScopedLock lock(streamSetLock); - - for (const auto& active_stream : streamSet) { - // If it's the current device - if ((&active_stream->device() == &blocking_stream->device()) && - // Make sure it's a default stream - ((active_stream->Flags() & hipStreamNonBlocking) == 0) && - // and it's not the current stream - (active_stream != blocking_stream)) { - // Get the last valid command - waitForStream(active_stream); + // If any stream in current thread is capturing in ThreadLocal mode + if (!hip::tls.capture_streams_.empty()) { + for (auto stream : hip::tls.capture_streams_) { + stream->SetCaptureStatus(hipStreamCaptureStatusInvalidated); } + return true; } + return false; + } else if (s != nullptr && s->GetCaptureStatus() == hipStreamCaptureStatusActive) { + s->SetCaptureStatus(hipStreamCaptureStatusInvalidated); + return true; + } else if (s != nullptr && s->GetCaptureStatus() == hipStreamCaptureStatusInvalidated) { + return true; } - - // Check if we have to wait anything - if (eventWaitList.size() > 0 || submitMarker) { - amd::Command* command = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); - if (command != nullptr) { - command->enqueue(); - command->release(); - } - } - - // Release all active commands. It's safe after the marker was enqueued - for (const auto& it : eventWaitList) { - it->release(); - } + return false; } // ================================================================================================ @@ -565,7 +463,7 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig // If stream is capturing but event is not recorded on event's stream. 
return hipErrorStreamCaptureIsolation; } - if (eventStream->DeviceId() == waitStream->DeviceId()) { + if ((waitStream != nullptr) && (eventStream->DeviceId() == waitStream->DeviceId())) { eventStream->GetDevice()->AddSafeStream(eventStream, waitStream); } } diff --git a/hipamd/src/hip_table_interface.cpp b/hipamd/src/hip_table_interface.cpp index 7173691e4..0b92c8696 100644 --- a/hipamd/src/hip_table_interface.cpp +++ b/hipamd/src/hip_table_interface.cpp @@ -1743,3 +1743,20 @@ hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph, hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr) { return hip::GetHipDispatchTable()->hipGetFuncBySymbol_fn(functionPtr, symbolPtr); } +hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, + const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx) { + return hip::GetHipDispatchTable()->hipDrvGraphExecMemsetNodeSetParams_fn(hGraphExec, hNode, + memsetParams, ctx); +} +hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, + const hipGraphNode_t* dependencies, size_t numDependencies, + hipDeviceptr_t dptr) { + return hip::GetHipDispatchTable()->hipDrvGraphAddMemFreeNode_fn(phGraphNode, hGraph, + dependencies, numDependencies, + dptr); +} +hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, + const HIP_MEMCPY3D* copyParams, hipCtx_t ctx) { + return hip::GetHipDispatchTable()->hipDrvGraphExecMemcpyNodeSetParams_fn(hGraphExec, hNode, + copyParams, ctx); +} diff --git a/hipamd/src/hip_vm.cpp b/hipamd/src/hip_vm.cpp index f8dd197c1..79bd1230a 100644 --- a/hipamd/src/hip_vm.cpp +++ b/hipamd/src/hip_vm.cpp @@ -56,7 +56,7 @@ hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void* const auto& dev_info = g_devices[0]->devices()[0]->info(); if (size == 0 || ((size % dev_info.virtualMemAllocGranularity_) != 0) - || ((alignment % dev_info.virtualMemAllocGranularity_) != 0)) { + || ((alignment & (alignment - 1)) != 0)) { HIP_RETURN(hipErrorMemoryAllocation); } @@ -86,8 +86,8 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, HIP_RETURN(hipErrorInvalidValue); } - // Currently only support non-IPC allocations - if (prop->requestedHandleType != hipMemHandleTypeNone) { + if (prop->requestedHandleType != hipMemHandleTypeNone + && prop->requestedHandleType != hipMemHandleTypePosixFileDescriptor) { HIP_RETURN(hipErrorNotSupported); } @@ -139,11 +139,29 @@ hipError_t hipMemExportToShareableHandle(void* shareableHandle, unsigned long long flags) { HIP_INIT_API(hipMemExportToShareableHandle, shareableHandle, handle, handleType, flags); - if (flags != 0 || handle == nullptr || shareableHandle == nullptr) { + if (flags != 0 || handle == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN(hipErrorNotSupported); + hip::GenericAllocation* ga = reinterpret_cast(handle); + if (ga == nullptr) { + LogError("Generic Allocation is nullptr"); + HIP_RETURN(hipErrorNotInitialized); + } + + if (ga->GetProperties().requestedHandleType != handleType) { + LogPrintfError("HandleType mismatch memoryHandleType: %d, requestedHandleType: %d", + ga->GetProperties().requestedHandleType, handleType); + HIP_RETURN(hipErrorInvalidValue); + } + + if (!ga->asAmdMemory().getContext().devices()[0]->ExportShareableVMMHandle( + ga->asAmdMemory().getUserData().hsa_handle, flags, shareableHandle)) { + LogPrintfError("Exporting Handle failed with flags: %d", flags); + 
HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(hipSuccess); } hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location, void* ptr) { @@ -205,7 +223,33 @@ hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* hand HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN(hipErrorNotSupported); + amd::Device* device = hip::getCurrentDevice()->devices()[0]; + amd::Memory* phys_mem_obj = new (device->context()) amd::Buffer(device->context(), + ROCCLR_MEM_PHYMEM | ROCCLR_MEM_INTERPROCESS, 0, osHandle); + + if (phys_mem_obj == nullptr) { + LogError("failed to new a va range curr_mem_obj object!"); + HIP_RETURN(hipErrorInvalidValue); + } + + if (!phys_mem_obj->create(nullptr, false)) { + LogError("failed to create a va range mem object"); + phys_mem_obj->release(); + HIP_RETURN(hipErrorInvalidValue); + } + + hipMemAllocationProp prop {}; + prop.type = hipMemAllocationTypePinned; + prop.location.type = hipMemLocationTypeDevice; + prop.location.id = hip::getCurrentDevice()->deviceId(); + + phys_mem_obj->getUserData().deviceId = hip::getCurrentDevice()->deviceId(); + phys_mem_obj->getUserData().data = new hip::GenericAllocation(*phys_mem_obj, 0, prop); + *handle = reinterpret_cast(phys_mem_obj->getUserData().data); + + amd::MemObjMap::RemoveMemObj(phys_mem_obj->getSvmPtr()); + + HIP_RETURN(hipSuccess); } hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocationHandle_t handle, @@ -287,15 +331,10 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, } for (size_t desc_idx = 0; desc_idx < count; ++desc_idx) { - if (desc[desc_idx].location.id >= g_devices.size()) { HIP_RETURN(hipErrorInvalidValue) } - if (desc[desc_idx].flags == hipMemAccessFlagsProtRead) { - HIP_RETURN(hipErrorInvalidValue) - } - auto& dev = g_devices[desc[desc_idx].location.id]; amd::Device::VmmAccess access_flags = static_cast(desc[desc_idx].flags); @@ -314,12 +353,12 @@ hipError_t hipMemUnmap(void* ptr, size_t size) { HIP_RETURN(hipErrorInvalidValue); } - amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(ptr); - if (vaddr_mem_obj == nullptr && vaddr_mem_obj->getSize() != size) { + amd::Memory* vaddr_sub_obj = amd::MemObjMap::FindMemObj(ptr); + if (vaddr_sub_obj == nullptr && vaddr_sub_obj->getSize() != size) { HIP_RETURN(hipErrorInvalidValue); } - amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj; + amd::Memory* phys_mem_obj = vaddr_sub_obj->getUserData().phys_mem_obj; if (phys_mem_obj == nullptr) { HIP_RETURN(hipErrorInvalidValue); } diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp index 9f902c071..9079d8402 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp @@ -441,39 +441,6 @@ bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id, return true; } -bool UnbundleBitCode(const std::vector& bundled_llvm_bitcode, const std::string& isa, - size_t& co_offset, size_t& co_size) { - std::string magic(bundled_llvm_bitcode.begin(), - bundled_llvm_bitcode.begin() + bundle_magic_string_size); - if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR)) { - // Handle case where the whole file is unbundled - return true; - } - - std::string bundled_llvm_bitcode_s(bundled_llvm_bitcode.begin(), - bundled_llvm_bitcode.begin() + bundled_llvm_bitcode.size()); - const void* data = reinterpret_cast(bundled_llvm_bitcode_s.c_str()); - const auto obheader = reinterpret_cast(data); - const auto* desc = &obheader->desc[0]; 
- for (uint64_t idx = 0; idx < obheader->numOfCodeObjects; ++idx, - desc = reinterpret_cast( - reinterpret_cast(&desc->bundleEntryId[0]) + - desc->bundleEntryIdSize)) { - const void* image = - reinterpret_cast(reinterpret_cast(obheader) + desc->offset); - const size_t image_size = desc->size; - std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize}; - - // Check if the device id and code object id are compatible - if (isCodeObjectCompatibleWithDevice(bundleEntryId, isa)) { - co_offset = (reinterpret_cast(image) - reinterpret_cast(data)); - co_size = image_size; - break; - } - } - return true; -} - bool addCodeObjData(amd_comgr_data_set_t& input, const std::vector& source, const std::string& name, const amd_comgr_data_kind_t type) { amd_comgr_data_t data; diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp index 05e1c013d..aaa428ac7 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp @@ -31,8 +31,6 @@ THE SOFTWARE. namespace hiprtc { namespace helpers { -bool UnbundleBitCode(const std::vector& bundled_bit_code, const std::string& isa, - size_t& co_offset, size_t& co_size); bool addCodeObjData(amd_comgr_data_set_t& input, const std::vector& source, const std::string& name, const amd_comgr_data_kind_t type); bool extractBuildLog(amd_comgr_data_set_t dataSet, std::string& buildLog); diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index de597272a..9b88aa034 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -545,8 +545,7 @@ amd_comgr_data_kind_t RTCLinkProgram::GetCOMGRDataKind(hiprtcJITInputType input_ data_kind = AMD_COMGR_DATA_KIND_BC; break; case HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE: - data_kind = - HIPRTC_USE_RUNTIME_UNBUNDLER ? AMD_COMGR_DATA_KIND_BC : AMD_COMGR_DATA_KIND_BC_BUNDLE; + data_kind = AMD_COMGR_DATA_KIND_BC_BUNDLE; break; case HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE: data_kind = AMD_COMGR_DATA_KIND_AR_BUNDLE; @@ -561,32 +560,13 @@ amd_comgr_data_kind_t RTCLinkProgram::GetCOMGRDataKind(hiprtcJITInputType input_ bool RTCLinkProgram::AddLinkerDataImpl(std::vector& link_data, hiprtcJITInputType input_type, std::string& link_file_name) { - std::vector llvm_bitcode; - // If this is bundled bitcode then unbundle this. 
- if (HIPRTC_USE_RUNTIME_UNBUNDLER && input_type == HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE) { - if (!findIsa()) { - return false; - } - - size_t co_offset = 0; - size_t co_size = 0; - if (!UnbundleBitCode(link_data, isa_, co_offset, co_size)) { - LogError("Error in hiprtc: unable to unbundle the llvm bitcode"); - return false; - } - - llvm_bitcode.assign(link_data.begin() + co_offset, link_data.begin() + co_offset + co_size); - } else { - llvm_bitcode.assign(link_data.begin(), link_data.end()); - } - amd_comgr_data_kind_t data_kind; if ((data_kind = GetCOMGRDataKind(input_type)) == AMD_COMGR_DATA_KIND_UNDEF) { LogError("Cannot find the correct COMGR data kind"); return false; } - if (!addCodeObjData(link_input_, llvm_bitcode, link_file_name, data_kind)) { + if (!addCodeObjData(link_input_, link_data, link_file_name, data_kind)) { LogError("Error in hiprtc: unable to add linked code object"); return false; } diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt index 523eb2b5c..0223ffd0f 100644 --- a/opencl/CMakeLists.txt +++ b/opencl/CMakeLists.txt @@ -15,9 +15,10 @@ set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory") include(GNUInstallDirs) option(BUILD_TESTS "Enable building OpenCL tests" OFF) -option(BUILD_ICD "Enable building OpenCL ICD Loader" OFF) +option(BUILD_ICD "Enable building OpenCL ICD Loader" ON) option(EMU_ENV "Enable building for emulation environment" OFF) option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorganization backward compatibility" OFF) +option(ENABLE_ADDRESS_SANITIZER "Option to enable ASAN build" OFF) # Add flags to generate PDB files with full symbolic information if(MSVC) @@ -25,11 +26,17 @@ if(MSVC) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG:FULL") endif() -set(OPENCL_ICD_LOADER_HEADERS_DIR "${CMAKE_CURRENT_LIST_DIR}/khronos/headers/opencl2.2" CACHE PATH "") - -###--- Packaging ------------------------------------------------------------### +# Address sanitizer options +if(ENABLE_ADDRESS_SANITIZER) + message(STATUS "Building ocltst tests with Address Sanitizer options") + set(CMAKE_C_COMPILER clang) + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -shared-libasan -g -gz") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -shared-libasan -g -gz") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags -fuse-ld=lld -fsanitize=address -shared-libasan -g -gz -Wl,--build-id=sha1 -L${ROCM_PATH}/lib/asan") +endif() -# DEV package +set(OPENCL_ICD_LOADER_HEADERS_DIR "${CMAKE_CURRENT_LIST_DIR}/khronos/headers/opencl2.2" CACHE PATH "") if(BUILD_ICD) add_subdirectory(khronos/icd) else() @@ -42,6 +49,19 @@ if(BUILD_TESTS) add_subdirectory(tests/ocltst) endif() +###--- Packaging ------------------------------------------------------------### + +# DEV package +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/khronos/headers/opencl2.2/CL" + DESTINATION include + COMPONENT DEV + PATTERN cl_d3d10.h EXCLUDE + PATTERN cl_d3d11.h EXCLUDE + PATTERN cl_dx9_media_sharing.h EXCLUDE + PATTERN cl_egl.h EXCLUDE + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE + ) + ############################# # Packaging steps ############################# diff --git a/opencl/amdocl/cl_gl.cpp b/opencl/amdocl/cl_gl.cpp index a6a2851d1..7ccfb0c00 100644 --- a/opencl/amdocl/cl_gl.cpp +++ b/opencl/amdocl/cl_gl.cpp @@ -769,7 +769,7 @@ RUNTIME_ENTRY_RET(cl_event, clCreateEventFromGLsyncKHR, // initially set the status of fence 
as queued clglEvent->setStatus(CL_SUBMITTED); // store GLsync id of the fence in event in order to associate them together - clglEvent->setData(clGLsync); + clglEvent->data().emplace_back(clGLsync); amd::Event* evt = clglEvent; evt->retain(); *not_null(errcode_ret) = CL_SUCCESS; diff --git a/opencl/amdocl/cl_icd.cpp b/opencl/amdocl/cl_icd.cpp index 4fce08b06..1d9638f62 100644 --- a/opencl/amdocl/cl_icd.cpp +++ b/opencl/amdocl/cl_icd.cpp @@ -132,134 +132,6 @@ cl_icd_dispatch amd::ICDDispatchedObject::icdVendorDispatch_[] = { clSetProgramReleaseCallback, clSetProgramSpecializationConstant }}; -#if defined(_WIN32) -#include - -#pragma comment(lib, "shlwapi.lib") - -static bool ShouldLoadPlatform() { - // Get the OpenCL ICD registry values - HKEY platformsKey = NULL; - if (RegOpenKeyExA(HKEY_LOCAL_MACHINE, "SOFTWARE\\Khronos\\OpenCL\\Vendors", 0, KEY_READ, - &platformsKey) != ERROR_SUCCESS) - return true; - - std::vector registryValues; - DWORD dwIndex = 0; - while (true) { - char cszLibraryName[1024] = {0}; - DWORD dwLibraryNameSize = sizeof(cszLibraryName); - DWORD dwLibraryNameType = 0; - DWORD dwValue = 0; - DWORD dwValueSize = sizeof(dwValue); - - if (RegEnumValueA(platformsKey, dwIndex++, cszLibraryName, &dwLibraryNameSize, NULL, - &dwLibraryNameType, (LPBYTE)&dwValue, &dwValueSize) != ERROR_SUCCESS) - break; - // Require that the value be a DWORD and equal zero - if (dwLibraryNameType != REG_DWORD || dwValue != 0) { - continue; - } - registryValues.push_back(cszLibraryName); - } - RegCloseKey(platformsKey); - - HMODULE hm = NULL; - if (!GetModuleHandleExA( - GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - (LPCSTR)&ShouldLoadPlatform, &hm)) - return true; - - char cszDllPath[1024] = {0}; - if (!GetModuleFileNameA(hm, cszDllPath, sizeof(cszDllPath))) return true; - - // If we are loaded from the DriverStore, then there should be a registry - // value matching our current module absolute path. - if (std::find(registryValues.begin(), registryValues.end(), cszDllPath) == registryValues.end()) - return true; - - LPSTR cszFileName; - char buffer[1024] = {0}; - if (!GetFullPathNameA(cszDllPath, sizeof(buffer), buffer, &cszFileName)) return true; - - // We found an absolute path in the registry that matched this DLL, now - // check if there is also an entry with the same filename. - if (std::find(registryValues.begin(), registryValues.end(), cszFileName) == registryValues.end()) - return true; - - // Lastly, check if there is a DLL with the same name in the System folder. - char cszSystemPath[1024] = {0}; -#if defined(ATI_BITS_32) - if (!GetSystemWow64DirectoryA(cszSystemPath, sizeof(cszSystemPath))) -#endif // defined(ATI_BITS_32) - if (!GetSystemDirectoryA(cszSystemPath, sizeof(cszSystemPath))) return true; - - std::string systemDllPath; - systemDllPath.append(cszSystemPath).append("\\").append(cszFileName); - if (!PathFileExistsA(systemDllPath.c_str())) { - return true; - } - - // If we get here, then all 3 conditions are true: - // - An entry in the registry with an absolute path matches the current DLL - // - An entry in the registry with a relative path matches the current DLL - // - A DLL with the same name was found in the system directory - // - // We should not load this platform! - - return false; -} - -#else - -#include - -// If there is only one platform, load it. 
-// If there is more than one platform, only load platforms that have visible devices -// If all platforms have no devices available, only load the PAL platform -static bool ShouldLoadPlatform() { - bool shouldLoad = true; - - if (!amd::Runtime::initialized()) { - amd::Runtime::init(); - } - const int numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false); - - void *otherPlatform = nullptr; - if (amd::IS_LEGACY) { - otherPlatform = dlopen("libamdocl64.so", RTLD_LAZY); - if (otherPlatform != nullptr) { // Present platform exists - shouldLoad = numDevices > 0; - } - } else { - otherPlatform = dlopen("libamdocl-orca64.so", RTLD_LAZY); - if (otherPlatform != nullptr) { // Legacy platform exists - // gcc4.8 doesn't support casting void* to a function pointer - // Work around this by creating a typedef untill we upgrade the compiler - typedef void*(*clGetFunctionAddress_t)(const char *); - typedef cl_int(*clIcdGetPlatformIDs_t)(cl_uint, cl_platform_id *, cl_uint *); - - clGetFunctionAddress_t legacyGetFunctionAddress = - reinterpret_cast(dlsym(otherPlatform, "clGetExtensionFunctionAddress")); - clIcdGetPlatformIDs_t legacyGetPlatformIDs = - reinterpret_cast(legacyGetFunctionAddress("clIcdGetPlatformIDsKHR")); - - cl_uint numLegacyPlatforms = 0; - legacyGetPlatformIDs(0, nullptr, &numLegacyPlatforms); - - shouldLoad = (numDevices > 0) || (numLegacyPlatforms == 0); - } - } - - if (otherPlatform != nullptr) { - dlclose(otherPlatform); - } - - return shouldLoad; -} - -#endif // defined(_WIN32) - CL_API_ENTRY cl_int CL_API_CALL clIcdGetPlatformIDsKHR(cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms) { @@ -268,16 +140,6 @@ CL_API_ENTRY cl_int CL_API_CALL clIcdGetPlatformIDsKHR(cl_uint num_entries, return CL_INVALID_VALUE; } - static bool shouldLoad = true; - - static std::once_flag initOnce; - std::call_once(initOnce, [](){ shouldLoad = ShouldLoadPlatform(); }); - - if (!shouldLoad) { - *not_null(num_platforms) = 0; - return CL_SUCCESS; - } - if (!amd::Runtime::initialized()) { amd::Runtime::init(); } diff --git a/opencl/packaging/CMakeLists.txt b/opencl/packaging/CMakeLists.txt index 3643f88e7..d9329c415 100644 --- a/opencl/packaging/CMakeLists.txt +++ b/opencl/packaging/CMakeLists.txt @@ -22,6 +22,12 @@ install(TARGETS amdocl DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan) install(FILES ${opencl_SOURCE_DIR}/LICENSE.txt DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary) install(FILES ${opencl_SOURCE_DIR}/LICENSE.txt DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan COMPONENT asan) +install(DIRECTORY ${opencl_SOURCE_DIR}/khronos/headers/opencl2.2/CL + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT dev + USE_SOURCE_PERMISSIONS + PATTERN cl_d3d10.h EXCLUDE + PATTERN cl_d3d11.h EXCLUDE + PATTERN cl_dx9_media_sharing.h EXCLUDE ) if(BUILD_ICD) install(TARGETS OpenCL DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT icd ) diff --git a/opencl/tests/ocltst/env/CMakeLists.txt b/opencl/tests/ocltst/env/CMakeLists.txt index 13dec123f..308acc7d6 100644 --- a/opencl/tests/ocltst/env/CMakeLists.txt +++ b/opencl/tests/ocltst/env/CMakeLists.txt @@ -34,11 +34,10 @@ target_include_directories(ocltst PRIVATE $) -target_link_libraries(ocltst PRIVATE OpenCL::OpenCL ${CMAKE_DL_LIBS}) - -if(NOT WIN32) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -endif() +target_link_libraries(ocltst + PRIVATE + OpenCL + ) set_target_properties(ocltst PROPERTIES INSTALL_RPATH "$ORIGIN") diff --git a/opencl/tests/ocltst/module/gl/CMakeLists.txt b/opencl/tests/ocltst/module/gl/CMakeLists.txt index 
41dffa15b..115879cf1 100644 --- a/opencl/tests/ocltst/module/gl/CMakeLists.txt +++ b/opencl/tests/ocltst/module/gl/CMakeLists.txt @@ -45,7 +45,9 @@ target_include_directories(oclgl PRIVATE $) -target_link_libraries(oclgl PRIVATE +target_link_libraries(oclgl + PRIVATE + OpenCL ${GLEW_LIBRARIES} ${OPENGL_LIBRARIES}) diff --git a/opencl/tests/ocltst/module/perf/CMakeLists.txt b/opencl/tests/ocltst/module/perf/CMakeLists.txt index e0134c922..b9780283c 100644 --- a/opencl/tests/ocltst/module/perf/CMakeLists.txt +++ b/opencl/tests/ocltst/module/perf/CMakeLists.txt @@ -95,10 +95,9 @@ target_include_directories(oclperf PRIVATE $) -target_link_libraries(oclperf PRIVATE OpenCL::OpenCL ${CMAKE_DL_LIBS}) -if(NOT WIN32) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -endif() +target_link_libraries(oclperf + PRIVATE + OpenCL) add_custom_command( TARGET oclperf POST_BUILD diff --git a/opencl/tests/ocltst/module/runtime/CMakeLists.txt b/opencl/tests/ocltst/module/runtime/CMakeLists.txt index 1ffd58ad0..0b5de9417 100644 --- a/opencl/tests/ocltst/module/runtime/CMakeLists.txt +++ b/opencl/tests/ocltst/module/runtime/CMakeLists.txt @@ -68,10 +68,9 @@ target_include_directories(oclruntime PRIVATE $) -target_link_libraries(oclruntime PRIVATE OpenCL::OpenCL ${CMAKE_DL_LIBS}) -if(NOT WIN32) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -endif() +target_link_libraries(oclruntime + PRIVATE + OpenCL) add_custom_command( TARGET oclruntime POST_BUILD diff --git a/opencl/tools/clinfo/CMakeLists.txt b/opencl/tools/clinfo/CMakeLists.txt index da795267d..217f18225 100644 --- a/opencl/tools/clinfo/CMakeLists.txt +++ b/opencl/tools/clinfo/CMakeLists.txt @@ -4,7 +4,7 @@ target_compile_definitions(clinfo PRIVATE CL_TARGET_OPENCL_VERSION=220 HAVE_CL2_ target_include_directories(clinfo PRIVATE ${OPENCL_ICD_LOADER_HEADERS_DIR}) -target_link_libraries(clinfo OpenCL::OpenCL) +target_link_libraries(clinfo OpenCL) INSTALL(TARGETS clinfo RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/opencl/tools/cltrace/CMakeLists.txt b/opencl/tools/cltrace/CMakeLists.txt index b162b4fb2..f8f340b8f 100644 --- a/opencl/tools/cltrace/CMakeLists.txt +++ b/opencl/tools/cltrace/CMakeLists.txt @@ -7,6 +7,8 @@ else() set_target_properties(cltrace PROPERTIES LINK_DEPENDS "${CMAKE_CURRENT_LIST_DIR}/cltrace.map") endif() +target_compile_definitions(cltrace PRIVATE CL_TARGET_OPENCL_VERSION=220) + target_include_directories(cltrace PRIVATE ${CMAKE_SOURCE_DIR}/opencl ${OPENCL_ICD_LOADER_HEADERS_DIR} ${ROCCLR_INCLUDE_DIR}) INSTALL(TARGETS cltrace diff --git a/rocclr/cmake/ROCclrLC.cmake b/rocclr/cmake/ROCclrLC.cmake index b41fcea8a..72f406210 100644 --- a/rocclr/cmake/ROCclrLC.cmake +++ b/rocclr/cmake/ROCclrLC.cmake @@ -18,7 +18,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
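The cltrace hunk above adds `CL_TARGET_OPENCL_VERSION=220` as a compile definition. As a minimal sketch (not part of this patch) of why that matters: the Khronos headers default to the newest OpenCL version and emit a warning when the macro is absent, so a tool that only needs OpenCL 2.2 entry points pins the macro before including `CL/cl.h`. The tiny program below is illustrative only.

```cpp
// Illustrative only: pin the OpenCL API version before including the Khronos headers,
// mirroring the CL_TARGET_OPENCL_VERSION=220 compile definition added for cltrace.
#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>

int main() {
  cl_uint num_platforms = 0;
  // clGetPlatformIDs exists in every OpenCL version, so this builds cleanly against 2.2.
  clGetPlatformIDs(0, nullptr, &num_platforms);
  return num_platforms == 0 ? 1 : 0;
}
```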
-find_package(amd_comgr REQUIRED CONFIG +find_package(amd_comgr 2.8 REQUIRED CONFIG PATHS /opt/rocm/ ${ROCM_INSTALL_PATH} diff --git a/rocclr/device/blit.cpp b/rocclr/device/blit.cpp index 33b04c598..140ab00f5 100644 --- a/rocclr/device/blit.cpp +++ b/rocclr/device/blit.cpp @@ -41,7 +41,7 @@ bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, } // Copy memory - amd::Os::fastMemcpy(dstHost, reinterpret_cast(src) + origin[0], size[0]); + std::memcpy(dstHost, reinterpret_cast(src) + origin[0], size[0]); // Unmap device memory srcMemory.cpuUnmap(vDev_); @@ -69,8 +69,8 @@ bool HostBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, dstOffset = hostRect.offset(0, y, z); // Copy memory line by line - amd::Os::fastMemcpy((reinterpret_cast
(dstHost) + dstOffset),
-                          (reinterpret_cast(src) + srcOffset), size[0]);
+      std::memcpy((reinterpret_cast(dstHost) + dstOffset),
+                  (reinterpret_cast(src) + srcOffset), size[0]);
     }
   }
@@ -133,8 +133,8 @@ bool HostBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
   // Copy memory line by line
   for (size_t row = 0; row < size[1]; ++row) {
     // Copy memory
-    amd::Os::fastMemcpy((reinterpret_cast(dstHost) + dstOffs),
-                        (reinterpret_cast(src) + srcOffs), copySize);
+    std::memcpy((reinterpret_cast(dstHost) + dstOffs),
+                (reinterpret_cast(src) + srcOffs), copySize);
     srcOffs += srcRowPitch;
     dstOffs += rowPitch;
@@ -163,7 +163,7 @@ bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory
   }
   // Copy memory
-  amd::Os::fastMemcpy(reinterpret_cast(dst) + origin[0], srcHost, size[0]);
+  std::memcpy(reinterpret_cast(dst) + origin[0], srcHost, size[0]);
   // Unmap the device memory
   dstMemory.cpuUnmap(vDev_);
@@ -191,8 +191,8 @@ bool HostBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMe
       dstOffset = bufRect.offset(0, y, z);
       // Copy memory line by line
-      amd::Os::fastMemcpy((reinterpret_cast(dst) + dstOffset),
-                          (reinterpret_cast(srcHost) + srcOffset), size[0]);
+      std::memcpy((reinterpret_cast(dst) + dstOffset),
+                  (reinterpret_cast(srcHost) + srcOffset), size[0]);
     }
   }
@@ -258,8 +258,8 @@ bool HostBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
   // Copy memory line by line
   for (size_t row = 0; row < size[1]; ++row) {
     // Copy memory
-    amd::Os::fastMemcpy((reinterpret_cast(dst) + dstOffs),
-                        (reinterpret_cast(srcHost) + srcOffs), copySize);
+    std::memcpy((reinterpret_cast(dst) + dstOffs),
+                (reinterpret_cast(srcHost) + srcOffs), copySize);
     dstOffs += dstRowPitch;
     srcOffs += rowPitch;
@@ -293,8 +293,8 @@ bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstM
   }
   // Straight forward buffer copy
-  amd::Os::fastMemcpy((reinterpret_cast(dst) + dstOrigin[0]),
-                      (reinterpret_cast(src) + srcOrigin[0]), size[0]);
+  std::memcpy((reinterpret_cast(dst) + dstOrigin[0]),
+              (reinterpret_cast(src) + srcOrigin[0]), size[0]);
   // Unmap source and destination memory
   dstMemory.cpuUnmap(vDev_);
@@ -329,8 +329,8 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
       size_t dstOffset = dstRect.offset(0, y, z);
       // Copy memory line by line
-      amd::Os::fastMemcpy((reinterpret_cast(dst) + dstOffset),
-                          (reinterpret_cast(src) + srcOffset), size[0]);
+      std::memcpy((reinterpret_cast(dst) + dstOffset),
+                  (reinterpret_cast(src) + srcOffset), size[0]);
     }
   }
@@ -392,8 +392,8 @@ bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memor
   // Copy memory line by line
   for (size_t rows = 0; rows < size[1]; ++rows) {
-    amd::Os::fastMemcpy((reinterpret_cast(dst) + dstOffs),
-                        (reinterpret_cast(src) + srcOffs), copySize);
+    std::memcpy((reinterpret_cast(dst) + dstOffs),
+                (reinterpret_cast(src) + srcOffs), copySize);
     srcOffs += srcRowPitch;
     dstOffs += copySize;
@@ -458,8 +458,8 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor
   // Copy memory line by line
   for (size_t rows = 0; rows < size[1]; ++rows) {
-    amd::Os::fastMemcpy((reinterpret_cast(dst) + dstOffs),
-                        (reinterpret_cast(src) + srcOffs), copySize);
+    std::memcpy((reinterpret_cast(dst) + dstOffs),
+                (reinterpret_cast(src) + srcOffs), copySize);
     srcOffs += copySize;
     dstOffs += dstRowPitch;
@@ -544,8 +544,8 @@ bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMe
   // Copy memory line by line
   for (size_t rows = 0; rows < size[1]; ++rows) {
-    amd::Os::fastMemcpy((reinterpret_cast(dst) + dstOffs),
-                        (reinterpret_cast(src) + srcOffs), copySize);
+    std::memcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); srcOffs += srcRowPitch; dstOffs += dstRowPitch; diff --git a/rocclr/device/blitcl.cpp b/rocclr/device/blitcl.cpp index 6c899c49c..921f4570b 100644 --- a/rocclr/device/blitcl.cpp +++ b/rocclr/device/blitcl.cpp @@ -48,12 +48,12 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( extern void __ockl_gws_init(uint nwm1, uint rid); __kernel void __amd_rocclr_fillBufferAligned( - __global uchar* bufUChar, __global ushort* bufUShort, __global uint* bufUInt, - __global ulong* bufULong, __global ulong2* bufULong2, __constant uchar* pattern, - uint pattern_size, ulong offset, ulong end_ptr, uint next_chunk) { + __global void* buf, __constant uchar* pattern, + uint pattern_size, uint alignment, ulong end_ptr, uint next_chunk) { int id = get_global_id(0); - long cur_id = offset + id * pattern_size; - if (bufULong2) { + long cur_id = id * pattern_size; + if (alignment == sizeof(ulong2)) { + __global ulong2* bufULong2 = (__global ulong2*)buf; __global ulong2* element = &bufULong2[cur_id]; __constant ulong2* pt = (__constant ulong2*)pattern; while ((ulong)element < end_ptr) { @@ -62,7 +62,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( } element += next_chunk; } - } else if (bufULong) { + } else if (alignment == sizeof(ulong)) { + __global ulong* bufULong = (__global ulong*)buf; __global ulong* element = &bufULong[cur_id]; __constant ulong* pt = (__constant ulong*)pattern; while ((ulong)element < end_ptr) { @@ -71,7 +72,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( } element += next_chunk; } - } else if (bufUInt) { + } else if (alignment == sizeof(uint)) { + __global uint* bufUInt = (__global uint*)buf; __global uint* element = &bufUInt[cur_id]; __constant uint* pt = (__constant uint*)pattern; while ((ulong)element < end_ptr) { @@ -80,7 +82,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( } element += next_chunk; } - } else if (bufUShort) { + } else if (alignment == sizeof(ushort)) { + __global ushort* bufUShort = (__global ushort*)buf; __global ushort* element = &bufUShort[cur_id]; __constant ushort* pt = (__constant ushort*)pattern; while ((ulong)element < end_ptr) { @@ -90,6 +93,7 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( element += next_chunk; } } else { + __global uchar* bufUChar = (__global uchar*)buf; __global uchar* element = &bufUChar[cur_id]; while ((ulong)element < end_ptr) { for (uint i = 0; i < pattern_size; ++i) { @@ -115,15 +119,12 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( pitch); } - __kernel void __amd_rocclr_copyBuffer(__global uchar* srcI, __global uchar* dstI, - ulong srcOrigin, ulong dstOrigin, ulong size, uint remainder, + __kernel void __amd_rocclr_copyBuffer(__global uchar* src, __global uchar* dst, + ulong size, uint remainder, uint aligned_size, ulong end_ptr, uint next_chunk) { ulong id = get_global_id(0); ulong id_remainder = id; - __global uchar* src = srcI + srcOrigin; - __global uchar* dst = dstI + dstOrigin; - if (aligned_size == sizeof(ulong2)) { __global ulong2* srcD = (__global ulong2*)(src); __global ulong2* dstD = (__global ulong2*)(dst); diff --git a/rocclr/device/comgrctx.cpp b/rocclr/device/comgrctx.cpp index 1a8afe60a..5b825d0db 100644 --- a/rocclr/device/comgrctx.cpp +++ b/rocclr/device/comgrctx.cpp @@ -119,6 +119,7 @@ bool Comgr::LoadLib(bool is_versioned) { GET_COMGR_SYMBOL(amd_comgr_get_mangled_name) GET_COMGR_SYMBOL(amd_comgr_populate_name_expression_map) GET_COMGR_SYMBOL(amd_comgr_map_name_expression_to_symbol_name) + 
GET_COMGR_SYMBOL(amd_comgr_action_info_set_bundle_entry_ids) is_ready_ = true; return true; } diff --git a/rocclr/device/comgrctx.hpp b/rocclr/device/comgrctx.hpp index d8dd5e17f..03ea2eeee 100644 --- a/rocclr/device/comgrctx.hpp +++ b/rocclr/device/comgrctx.hpp @@ -76,6 +76,7 @@ typedef amd_comgr_status_t (*t_amd_comgr_populate_mangled_names)(amd_comgr_data_ typedef amd_comgr_status_t (*t_amd_comgr_get_mangled_name)(amd_comgr_data_t data, size_t index, size_t *size, char *mangled_name); typedef amd_comgr_status_t (*t_amd_comgr_populate_name_expression_map)(amd_comgr_data_t data, size_t *count); typedef amd_comgr_status_t (*t_amd_comgr_map_name_expression_to_symbol_name)(amd_comgr_data_t data, size_t *size, char *name_expression, char* symbol_name); +typedef amd_comgr_status_t (*t_amd_comgr_action_info_set_bundle_entry_ids)(amd_comgr_action_info_t action_info, const char* bundle_entry_ids[], size_t count); struct ComgrEntryPoints { void* handle; @@ -129,6 +130,7 @@ struct ComgrEntryPoints { t_amd_comgr_get_mangled_name amd_comgr_get_mangled_name; t_amd_comgr_populate_name_expression_map amd_comgr_populate_name_expression_map; t_amd_comgr_map_name_expression_to_symbol_name amd_comgr_map_name_expression_to_symbol_name; + t_amd_comgr_action_info_set_bundle_entry_ids amd_comgr_action_info_set_bundle_entry_ids; }; #ifdef COMGR_DYN_DLL @@ -310,7 +312,11 @@ class Comgr : public amd::AllStatic { static amd_comgr_status_t map_name_expression_to_symbol_name(amd_comgr_data_t data, size_t *size, char *name_expression, char* symbol_name) { return COMGR_DYN(amd_comgr_map_name_expression_to_symbol_name)(data, size, name_expression, symbol_name); } - + static amd_comgr_status_t action_info_set_bundle_entry_ids(amd_comgr_action_info_t action_info, + const char* bundle_entry_ids[], size_t count) { + return COMGR_DYN(amd_comgr_action_info_set_bundle_entry_ids)(action_info, bundle_entry_ids, + count); + } private: static ComgrEntryPoints cep_; diff --git a/rocclr/device/devhostcall.cpp b/rocclr/device/devhostcall.cpp index 1729e5ebf..147d61253 100644 --- a/rocclr/device/devhostcall.cpp +++ b/rocclr/device/devhostcall.cpp @@ -273,12 +273,34 @@ HostcallListener* hostcallListener = nullptr; amd::Monitor listenerLock("Hostcall listener lock"); constexpr static uint64_t kTimeoutFloor = K * K * 4; constexpr static uint64_t kTimeoutCeil = K * K * 16; - +static struct Init { + enum class State { + kDefault = 0, + kInit, + kDestroy, + kExit + }; + volatile State state = State::kDefault; + ~Init() { + if (state == State::kInit) { + state = State::kDestroy; + // @note: Under Linux thread destruction can be delayed and + // ROCR may crash in a wait for event occasionally. Hence, runtime needs + // an early exit. The logic isn't required for Windows. 
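The comments above explain why the hostcall listener needs an explicit shutdown handshake on Linux. A minimal sketch of that handshake follows, not part of the patch, assuming a simplified `std::atomic` state in place of the runtime's `volatile` member; every name below is invented for illustration.

```cpp
#include <atomic>
#include <thread>

enum class State { kDefault, kInit, kDestroy, kExit };
std::atomic<State> state{State::kDefault};

void listenerLoop() {
  state.store(State::kInit);
  while (true) {
    if (state.load() == State::kDestroy) {  // the static destructor asked us to stop
      state.store(State::kExit);            // acknowledge, then leave the thread
      return;
    }
    // ... wait on the doorbell signal and consume hostcall packets ...
  }
}

struct ShutdownGuard {
  ~ShutdownGuard() {
    if (state.load() == State::kInit) {
      state.store(State::kDestroy);
      while (state.load() != State::kExit) {}  // spin until the listener has exited
    }
  }
};

int main() {
  static ShutdownGuard guard;    // destroyed during static teardown, like kHostThreadActive
  std::thread listener(listenerLoop);
  listener.detach();             // the guard, not join(), provides the exit handshake
  return 0;
}
```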
+ while (IS_LINUX && (state == State::kDestroy)) {} + } + } +} kHostThreadActive; void HostcallListener::consumePackets() { uint64_t timeout = kTimeoutFloor; uint64_t signal_value = SIGNAL_INIT; + kHostThreadActive.state = Init::State::kInit; while (true) { while (true) { + if (kHostThreadActive.state == Init::State::kDestroy) { + kHostThreadActive.state = Init::State::kExit; + return; + } uint64_t new_value = doorbell_->Wait(signal_value, device::Signal::Condition::Ne, timeout); if (new_value != signal_value) { signal_value = new_value; @@ -312,7 +334,7 @@ void HostcallListener::terminate() { if (!amd::Os::isThreadAlive(thread_)) { return; } - + kHostThreadActive.state = Init::State::kExit; doorbell_->Reset(SIGNAL_DONE); // FIXME_lmoriche: fix termination handshake diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp index 3202ca2fb..76a6cb716 100644 --- a/rocclr/device/device.cpp +++ b/rocclr/device/device.cpp @@ -221,6 +221,8 @@ std::pair Isa::supportedIsas() { {"gfx1103", "gfx1103", true, true, 11, 0, 3, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, + {"gfx1200", "gfx1200", true, true, 12, 0, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, + {"gfx1201", "gfx1201", true, true, 12, 0, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, }; return std::make_pair(std::begin(supportedIsas_), std::end(supportedIsas_)); } @@ -376,6 +378,118 @@ amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) { } } +//================================================================================================== +bool Device::ValidateVirtualAddressRange(amd::Memory* vaddr_base_obj, amd::Memory* vaddr_sub_obj) { + + // Check if the start of the subbuffer is >= to base start. + if (vaddr_base_obj->getSvmPtr() > vaddr_sub_obj->getSvmPtr()) { + LogError("Sub buffer cannot start with addr lesser than base_start."); + return false; + } + + // Check if the new size belongs to the vaddr_base_obj range. + address vaddr_base_end = reinterpret_cast
(vaddr_base_obj->getSvmPtr()) + + vaddr_base_obj->getSize(); + address vaddr_sub_end = reinterpret_cast<address>
(vaddr_sub_obj->getSvmPtr()) + + vaddr_sub_obj->getSize(); + + if (vaddr_sub_end > vaddr_base_end) { + LogError("Sub buffer memory end cannot be greater than base_end. Return nullptr"); + return false; + } + + return true; +} + +//================================================================================================== +amd::Memory* Device::CreateVirtualBuffer(amd::Context& device_context, void* vptr, size_t size, + int deviceId, bool parent, bool kForceAlloc) { + + amd::Memory* vaddr_base_obj = nullptr; + amd::Memory* vaddr_sub_obj = nullptr; + constexpr bool kSysMemAlloc = false; + constexpr bool kSkipAlloc = false; + + if (parent) { + vaddr_base_obj = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_VA_RANGE_AMD, size, vptr); + if (vaddr_base_obj == nullptr) { + LogError("failed to new a va range curr_mem_obj object!"); + return nullptr; + } + // This curr_mem_obj->create() does not create an actual memory but stores the memory info + // with given vptr on ROCr backend. + if (!vaddr_base_obj->create(nullptr, kSysMemAlloc, kSkipAlloc, kForceAlloc)) { + LogError("failed to create a va range mem object"); + vaddr_base_obj->release(); + return nullptr; + } + + amd::MemObjMap::AddVirtualMemObj(vaddr_base_obj->getSvmPtr(), vaddr_base_obj); + } else { + // If not parent, but sub-buffer/child, then validate the address range + vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vptr); + if (vaddr_base_obj == nullptr) { + LogPrintfError("Cannot find entry in VirtualMemObjMap: 0x%x \n", vptr); + return nullptr; + } + assert(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD); + + size_t offset = (reinterpret_cast
(vptr) + - reinterpret_cast<address>
(vaddr_base_obj->getSvmPtr())); + vaddr_sub_obj = new (device_context) amd::Buffer(*vaddr_base_obj, CL_MEM_VA_RANGE_AMD, offset, + size); + + // This curr_mem_obj->create() does not create an actual memory but stores the memory info + // with given vptr on ROCr backend. + if (!vaddr_sub_obj->create(nullptr, kSysMemAlloc, kSkipAlloc, kForceAlloc)) { + LogError("failed to create a va range mem object"); + vaddr_sub_obj->release(); + return nullptr; + } + + vaddr_sub_obj->getUserData().deviceId = deviceId; + + if (!ValidateVirtualAddressRange(vaddr_base_obj, vaddr_sub_obj)) { + LogError("Validation failed on address range, returning nullptr"); + return nullptr; + } + } + + if (vptr != nullptr) { + // Assert to make sure that amd::Memory object has set the right ptr. + guarantee(vptr == (parent ? vaddr_base_obj->getSvmPtr() : vaddr_sub_obj->getSvmPtr()), + "amd::Memory object does not have the right ptr"); + } + + return parent ? vaddr_base_obj : vaddr_sub_obj; +} + +//================================================================================================== +bool Device::DestroyVirtualBuffer(amd::Memory* vaddr_mem_obj) { + + // Argument nullptr check. + if (vaddr_mem_obj == nullptr || vaddr_mem_obj->getSvmPtr() == nullptr) { + LogPrintfError("Mem obj passed is nullptr, vaddr_mem_obj: %p \n", vaddr_mem_obj); + return false; + } + + if (vaddr_mem_obj->parent() == nullptr) { + // If parent is nullptr, then vaddr_mem_obj is the parent. + amd::MemObjMap::RemoveVirtualMemObj(vaddr_mem_obj->getSvmPtr()); + return true; + } else { + // If parent is not nullptr, this is the sub-buffer object. + amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vaddr_mem_obj->getSvmPtr()); + if (vaddr_base_obj == nullptr) { + LogPrintfError("Cannot find mem obj for ptr: 0x%x", vaddr_mem_obj->getSvmPtr()); + return false; + } + vaddr_base_obj->removeSubBuffer(vaddr_mem_obj); + } + + return true; +} + void MemObjMap::UpdateAccess(amd::Device *peerDev) { if (peerDev == nullptr) { return; @@ -455,6 +569,9 @@ bool Device::BlitProgram::create(amd::Device* device, const std::string& extraKe if (!GPU_DUMP_BLIT_KERNELS) { opt += " -fno-enable-dump"; } + if (device->settings().kernel_arg_opt_) { + opt += " -Wb,-amdgpu-kernarg-preload-count=8 "; + } if ((retval = program_->build(devices, opt.c_str(), nullptr, nullptr, GPU_DUMP_BLIT_KERNELS)) != CL_SUCCESS) { DevLogPrintfError("Build failed for Kernel: %s with error code %d\n", @@ -775,11 +892,11 @@ bool Device::disableP2P(amd::Device* ptrDev) { } bool Device::UpdateStackSize(uint64_t stackSize) { - // Amount of space used by each wave is in units of 256 dwords. + // Amount of space used by each wave is in units of 256 dwords. // As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12 - // The field size supports a range of 0->(2M-256) dwords per wave64. + // The field size supports a range of 0->(2M-256) dwords per wave64. // Per lane this works out to 131056 bytes or 128K - 16 - uint64_t kStackSize = ((128 * Ki) - 16); + uint64_t kStackSize = ((128 * Ki) - 16); if (stackSize > kStackSize) { return false; } diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index c73b2b527..1a2bb1816 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -651,6 +651,19 @@ struct Info : public amd::EmbeddedObject { //! 
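Note: `ValidateVirtualAddressRange()`, added to device.cpp above, reduces to an interval-containment test: the sub-buffer must start at or after the base VA range and must end at or before it. Restated as a standalone predicate over raw pointers and sizes (illustrative only, not the runtime's types):

```cpp
#include <cstddef>
#include <cstdint>

// True when [sub_ptr, sub_ptr + sub_size) lies entirely inside
// [base_ptr, base_ptr + base_size).
bool RangeContains(const void* base_ptr, size_t base_size,
                   const void* sub_ptr, size_t sub_size) {
  auto base = reinterpret_cast<std::uintptr_t>(base_ptr);
  auto sub = reinterpret_cast<std::uintptr_t>(sub_ptr);
  if (sub < base) {
    return false;                               // Sub-buffer starts before the base range.
  }
  return (sub + sub_size) <= (base + base_size);  // And it must not run past the base end.
}
```

`CreateVirtualBuffer()` applies exactly this check to a child buffer carved out of a reserved VA range before it is added to the virtual memory object map.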
Device settings class Settings : public amd::HeapObject { public: + + enum KernelArgImpl { + HostKernelArgs = 0, //!< Kernel Arguments are put into host memory + DeviceKernelArgs, //!< Device memory kernel arguments with no memory + //!< ordering workaround (e.g. XGMI) + DeviceKernelArgsReadback, //!< Device memory kernel arguments with kernel + //!< argument readback workaround (works only in + //!< ASICS >= MI200) + DeviceKernelArgsHDP //!< Device memory kernel arguments with kernel + //!< argument readback plus HDP flush workaround. + //!< Works in all ASICS. Requires a valid hdp flush register + }; + uint64_t extensions_; //!< Supported OCL extensions union { struct { @@ -674,7 +687,9 @@ class Settings : public amd::HeapObject { uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine. - uint reserved_ : 10; + uint kernel_arg_opt_: 1; //!< Enables kernel arg optimization for blit kernels + uint kernel_arg_impl_ : 2; //!< Kernel argument implementation + uint reserved_ : 7; }; uint value_; }; @@ -1757,7 +1772,7 @@ class Device : public RuntimeObject { } virtual void* deviceLocalAlloc(size_t size, bool atomics = false, - bool pseudo_fine_grain = false) const { + bool pseudo_fine_grain = false, bool contiguous = false) const { ShouldNotCallThis(); return NULL; } @@ -1796,6 +1811,34 @@ class Device : public RuntimeObject { */ virtual void svmFree(void* ptr) const = 0; + /** + * Validatates Virtual Address range between parent and sub-buffer. + * + * @param vaddr_base_obj Parent/base object of the virtual address. + * @param vaddr_sub_obj Sub Buffer object of the virtual address. + */ + static bool ValidateVirtualAddressRange(amd::Memory* vaddr_base_obj, amd::Memory* vaddr_sub_obj); + + /** + * Abstracts the Virtual Buffer creation and memobj/virtual memobj add/delete logic. + * + * @param device_context Context the virtual buffer should be created. + * @param vptr virtual ptr to store in the buffer object. + * @param size Size of the buffer + * @param deviceId deviceId + * @param parent base_obj or sub_obj + * @param ForceAlloc force_alloc + */ + amd::Memory* CreateVirtualBuffer(Context& device_context, void* vptr, size_t size, + int deviceId, bool parent, bool kForceAlloc = false); + + /** + * Deletes Virtual Buffer and creates memob + * + * @param vaddr_mem_obj amd::Memory object of parent/sub buffer. + */ + bool DestroyVirtualBuffer(amd::Memory* vaddr_mem_obj); + /** * Reserve a VA range with no backing store * @@ -1830,6 +1873,29 @@ class Device : public RuntimeObject { */ virtual void virtualFree(void* addr) = 0; + /** + * Export Shareable VMM Handle to FD + * + * @param hsa_handle hsa_handle which has the phys_mem info. + * @param flags any flags to be passed + * @param shareableHandle exported handle, points to fdesc. + */ + virtual bool ExportShareableVMMHandle(uint64_t hsa_handle, int flags, void* shareableHandle) { + ShouldNotCallThis(); + return false; + } + + /** + * Import FD from Shareable VMM Handle + * + * @param osHandle os handle/fdesc + * @param hsa_handle_ptr hsa_handle which has the phys_mem info. 
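Note: the new `KernelArgImpl` values describe how kernel arguments reach the GPU and which write-visibility workaround, if any, is applied. The enum comments suggest a capability-driven choice: device-memory arguments need either no workaround (XGMI-like ordered writes), a kernel-argument readback (MI200 and newer), or a readback plus HDP flush (any ASIC with a valid HDP flush register). The selector below is only a sketch of that decision tree; the capability struct and its fields are assumptions, not something the runtime exposes.

```cpp
enum KernelArgImpl {
  HostKernelArgs = 0,        // Arguments staged in host memory.
  DeviceKernelArgs,          // Device memory, no ordering workaround needed.
  DeviceKernelArgsReadback,  // Device memory + kernel argument readback (MI200 and newer).
  DeviceKernelArgsHDP        // Device memory + readback + HDP flush (any ASIC).
};

struct DeviceCaps {           // Hypothetical capability summary for illustration.
  bool prefer_device_args;    // Is placing kernel args in device memory worthwhile at all?
  bool ordered_writes;        // Host writes reach the GPU in order (e.g. XGMI).
  bool supports_arg_readback; // Kernel argument readback workaround is available.
  bool has_hdp_flush_reg;     // A valid HDP flush register is mapped.
};

KernelArgImpl ChooseKernelArgImpl(const DeviceCaps& caps) {
  if (!caps.prefer_device_args) return HostKernelArgs;
  if (caps.ordered_writes) return DeviceKernelArgs;
  if (caps.supports_arg_readback) return DeviceKernelArgsReadback;
  if (caps.has_hdp_flush_reg) return DeviceKernelArgsHDP;
  return HostKernelArgs;      // Fall back to host memory when no workaround applies.
}
```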
+ */ + virtual bool ImportShareableVMMHandle(void* osHandle, uint64_t* hsa_handle_ptr) const { + ShouldNotCallThis(); + return false; + } + /** * @return True if the device successfully applied the SVM attributes in HMM for device memory */ diff --git a/rocclr/device/devkernel.cpp b/rocclr/device/devkernel.cpp index b536f8a4c..475e96594 100644 --- a/rocclr/device/devkernel.cpp +++ b/rocclr/device/devkernel.cpp @@ -617,6 +617,7 @@ Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& p workGroupInfo_.compileSizeHint_[1] = 0; workGroupInfo_.compileSizeHint_[2] = 0; workGroupInfo_.compileVecTypeHint_ = ""; + workGroupInfo_.isWGPMode_ = false; workGroupInfo_.uniformWorkGroupSize_ = false; workGroupInfo_.wavesPerSimdHint_ = 0; workGroupInfo_.constMemSize_ = 0; diff --git a/rocclr/device/devprogram.cpp b/rocclr/device/devprogram.cpp index 07d4914c2..c030e76b0 100644 --- a/rocclr/device/devprogram.cpp +++ b/rocclr/device/devprogram.cpp @@ -1172,10 +1172,6 @@ bool Program::linkImplLC(amd::option::Options* options) { if (options->oVariables->FP32RoundDivideSqrt) { linkOptions.push_back("correctly_rounded_sqrt"); } - if (options->oVariables->DenormsAreZero || AMD_GPU_FORCE_SINGLE_FP_DENORM == 0 || - (device().isa().versionMajor() < 9 && AMD_GPU_FORCE_SINGLE_FP_DENORM < 0)) { - linkOptions.push_back("daz_opt"); - } if (options->oVariables->FiniteMathOnly || options->oVariables->FastRelaxedMath) { linkOptions.push_back("finite_only"); } diff --git a/rocclr/device/pal/palblit.cpp b/rocclr/device/pal/palblit.cpp index 14b77c83e..cc4c04539 100644 --- a/rocclr/device/pal/palblit.cpp +++ b/rocclr/device/pal/palblit.cpp @@ -2194,37 +2194,7 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, // Program kernels arguments for the fill operation Memory* mem = &gpuMem(memory); - if (alignment == 2 * sizeof(uint64_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem); - } else if (alignment == sizeof(uint64_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } else if (alignment == sizeof(uint32_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } else if (alignment == sizeof(uint16_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } else { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); 
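Note: the fill and copy blit paths (here in palblit.cpp and in the matching rocblit.cpp change later in this patch) drop the per-alignment argument fan-out and instead pack a single typed pointer argument that carries the byte offset, plus pattern/copy size, alignment, a precomputed `end_ptr`, and a per-iteration stride `next_chunk`, as the replacement code that follows shows. A CPU reference of the fill those packed arguments describe might look like the sketch below, assuming `fill_count` counts pattern repetitions; the function and parameter names are illustrative.

```cpp
#include <cstdint>
#include <cstring>

// Reference fill: write fill_count repetitions of a pattern made of pattern_elems
// elements, each alignment bytes wide, starting at base + offset. end_ptr is
// computed the same way the diff computes it for the GPU kernel.
void FillReference(uint8_t* base, uint64_t offset, const uint8_t* pattern,
                   uint32_t pattern_elems, uint32_t alignment, uint64_t fill_count) {
  uint8_t* dst = base + offset;
  const uint8_t* end_ptr = base + offset + fill_count * pattern_elems * alignment;
  while (dst < end_ptr) {
    std::memcpy(dst, pattern, pattern_elems * alignment);  // One pattern instance.
    dst += pattern_elems * alignment;
  }
}
```

On the GPU side the same range is partitioned by `next_chunk` (global work size times pattern size) so each work-item strides through the buffer until it crosses `end_ptr`.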
- setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem, koffset); const size_t localWorkSize = 256; size_t globalWorkSize = std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size); @@ -2240,20 +2210,20 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, } gpuCB.unmap(&gpu()); Memory* pGpuCB = &gpuCB; - setArgument(kernels_[kFillType], 5, sizeof(cl_mem), &pGpuCB); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &pGpuCB); uint64_t offset = origin[0]; // Adjust the pattern size in the copy type size kpattern_size /= alignment; - setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size); - koffset /= alignment; - setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset); + setArgument(kernels_[kFillType], 2, sizeof(uint32_t), &kpattern_size); + setArgument(kernels_[kFillType], 3, sizeof(alignment), &alignment); + // Calculate max id - uint64_t end_ptr = memory.virtualAddress() + - (koffset + kfill_size * kpattern_size) * alignment; - setArgument(kernels_[kFillType], 8, sizeof(end_ptr), &end_ptr); + uint64_t end_ptr = memory.virtualAddress() + koffset + + kfill_size * kpattern_size * alignment; + setArgument(kernels_[kFillType], 4, sizeof(end_ptr), &end_ptr); uint32_t next_chunk = globalWorkSize * kpattern_size; - setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk); + setArgument(kernels_[kFillType], 5, sizeof(uint32_t), &next_chunk); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); @@ -2297,30 +2267,26 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds // Program kernels arguments for the blit operation Memory* mem = &gpuMem(srcMemory); - setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem); - mem = &gpuMem(dstMemory); - setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem); - // Program source origin uint64_t srcOffset = srcOrigin[0]; - setArgument(kernels_[kBlitType], 2, sizeof(srcOffset), &srcOffset); - + setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, srcOffset); + mem = &gpuMem(dstMemory); // Program destinaiton origin uint64_t dstOffset = dstOrigin[0]; - setArgument(kernels_[kBlitType], 3, sizeof(dstOffset), &dstOffset); + setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, dstOffset); uint64_t copySize = sizeIn[0]; - setArgument(kernels_[kBlitType], 4, sizeof(copySize), ©Size); + setArgument(kernels_[kBlitType], 2, sizeof(copySize), ©Size); - setArgument(kernels_[kBlitType], 5, sizeof(remainder), &remainder); - setArgument(kernels_[kBlitType], 6, sizeof(aligned_size), &aligned_size); + setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder); + setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size); // End pointer is the aligned copy size and destination offset uint64_t end_ptr = dstMemory.virtualAddress() + dstOffset + sizeIn[0] - remainder; - setArgument(kernels_[kBlitType], 7, sizeof(end_ptr), &end_ptr); + setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr); uint32_t next_chunk = globalWorkSize; - setArgument(kernels_[kBlitType], 8, sizeof(next_chunk), &next_chunk); + setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize); diff --git a/rocclr/device/pal/palblitcl.cpp b/rocclr/device/pal/palblitcl.cpp index 
1fe13aa75..7b607024a 100644 --- a/rocclr/device/pal/palblitcl.cpp +++ b/rocclr/device/pal/palblitcl.cpp @@ -42,6 +42,7 @@ extern void __amd_scheduler_pal(__global void*, __global void*, uint); \n); const char* TrapHandlerCode = RUNTIME_KERNEL( +\n.if .amdgcn.gfx_generation_number < 12 \n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF \n.set SQ_WAVE_PC_HI_HT_SHIFT , 24 \n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 @@ -259,5 +260,187 @@ const char* TrapHandlerCode = RUNTIME_KERNEL( \n.parked: \n s_trap 0x2 \n s_branch .parked +\n.else +\n.set DOORBELL_ID_SIZE , 10 +\n.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) +\n.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) +\n.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) +\n.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) +\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) +\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) +\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) +\n.set SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT , 4 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT , 7 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT , 6 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT , 8 +\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT , 0 +\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE , 6 +\n.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_SHIFT , 0 +\n.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_SIZE , 6 +\n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF +\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) +\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 28 +\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 4 +\n.set SQ_WAVE_STATE_PRIV_HALT_BFE , (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16)) +\n.set SQ_WAVE_STATE_PRIV_HALT_SHIFT , 14 +\n.set TRAP_ID_ABORT , 2 +\n.set TRAP_ID_DEBUGTRAP , 3 +\n.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) +\n.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 +\n.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) +\n.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) +\n.set TTMP6_SAVED_TRAP_ID_SHIFT , 25 +\n.set TTMP6_SAVED_TRAP_ID_SIZE , 4 +\n.set TTMP6_WAVE_STOPPED_SHIFT , 30 +\n.set TTMP8_DEBUG_FLAG_SHIFT , 31 +\n.set TTMP11_DEBUG_ENABLED_SHIFT , 23 +\n.set TTMP_PC_HI_SHIFT , 7 +\n +\n// ABI between first and second level trap handler: +\n// { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0] +\n// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0] +\n// ttmp12 = SQ_WAVE_STATE_PRIV +\n// ttmp14 = TMA[31:0] +\n// ttmp15 = TMA[63:32] +\n +\ntrap_entry: +\n // Branch if not a trap (an exception instead). +\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE +\n s_cbranch_scc0 .no_skip_debugtrap +\n +\n // If caused by s_trap then advance PC. +\n s_add_u32 ttmp0, ttmp0, 0x4 +\n s_addc_u32 ttmp1, ttmp1, 0x0 +\n +\n.not_s_trap: +\n // If llvm.debugtrap and debugger is not attached. +\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP +\n s_cbranch_scc0 .no_skip_debugtrap +\n +\n s_bitcmp0_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT +\n s_cbranch_scc0 .no_skip_debugtrap +\n +\n // Ignore llvm.debugtrap. +\n s_branch .exit_trap +\n +\n.no_skip_debugtrap: +\n // Save trap id and halt status in ttmp6. 
+\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) +\n s_min_u32 ttmp2, ttmp2, 0xF +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 +\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 +\n +\n // Fetch doorbell id for our queue. +\n s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) +\n s_wait_kmcnt 0 +\n s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK +\n +\n s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) +\n +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT +\n s_cbranch_scc0 .not_memory_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 +\n +\n // Aperture violation requires XNACK_ERROR == 0. +\n s_branch .not_aperture_violation +\n +\n.not_memory_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT +\n s_cbranch_scc0 .not_aperture_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 +\n +\n.not_aperture_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +\n s_cbranch_scc0 .not_illegal_instruction +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 +\n +\n.not_illegal_instruction: +\n s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE) +\n s_cbranch_scc0 .not_math_exception +\n s_getreg_b32 ttmp10, hwreg(HW_REG_TRAP_CTRL, SQ_WAVE_TRAP_CTRL_MATH_EXCP_SHIFT, SQ_WAVE_TRAP_CTRL_MATH_EXCP_SIZE) +\n s_and_b32 ttmp2, ttmp2, ttmp10 +\n +\n s_cbranch_scc0 .not_math_exception +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 +\n +\n.not_math_exception: +\n s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE +\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT +\n s_cbranch_scc0 .not_abort_trap +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 +\n +\n.not_abort_trap: +\n // If no other exception was flagged then report a generic error. +\n s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK +\n s_cbranch_scc1 .send_interrupt +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 +\n +\n.send_interrupt: +\n // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id +\n s_mov_b32 ttmp2, m0 +\n s_mov_b32 m0, ttmp3 +\n s_nop 0x0 // Manually inserted wait states +\n s_sendmsg sendmsg(MSG_INTERRUPT) +\n // Wait for the message to go out. +\n s_wait_kmcnt 0 +\n s_mov_b32 m0, ttmp2 +\n +\n // Parking the wave requires saving the original pc in the preserved ttmps. +\n // Register layout before parking the wave: +\n // +\n // ttmp10: ?[31:0] +\n // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0] +\n // +\n // After parking the wave: +\n // +\n // ttmp10: pc_lo[31:0] +\n // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] +\n // +\n // Save the PC +\n s_mov_b32 ttmp10, ttmp0 +\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK +\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT +\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) +\n s_or_b32 ttmp11, ttmp11, ttmp1 +\n +\n // Park the wave +\n s_getpc_b64 [ttmp0, ttmp1] +\n s_add_u32 ttmp0, ttmp0, .parked - . +\n s_addc_u32 ttmp1, ttmp1, 0x0 +\n +\n.halt_wave: +\n // Halt the wavefront upon restoring STATUS below. 
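Note: in the gfx12 path the trap handler reports exceptions by packing an exception-code bitmask above the 10-bit doorbell ID and sending the result in m0 via MSG_INTERRUPT. The constants below repeat the `.set` values from the handler so the packing can be checked in plain C++; the decode helpers are only for illustration.

```cpp
#include <cstdint>

constexpr uint32_t kDoorbellIdSize = 10;
constexpr uint32_t kDoorbellIdMask = (1u << kDoorbellIdSize) - 1;

// Exception codes occupy the bits above the doorbell ID, one flag per condition.
constexpr uint32_t kEcWaveAbort             = 1u << (kDoorbellIdSize + 0);
constexpr uint32_t kEcWaveTrap              = 1u << (kDoorbellIdSize + 1);
constexpr uint32_t kEcWaveMathError         = 1u << (kDoorbellIdSize + 2);
constexpr uint32_t kEcWaveIllegalInst       = 1u << (kDoorbellIdSize + 3);
constexpr uint32_t kEcWaveMemoryViolation   = 1u << (kDoorbellIdSize + 4);
constexpr uint32_t kEcWaveApertureViolation = 1u << (kDoorbellIdSize + 5);

// m0 payload: (exception_code << DOORBELL_ID_SIZE) | doorbell_id.
constexpr uint32_t PackInterrupt(uint32_t exception_bits, uint32_t doorbell_id) {
  return exception_bits | (doorbell_id & kDoorbellIdMask);
}

constexpr uint32_t DoorbellOf(uint32_t payload) { return payload & kDoorbellIdMask; }
constexpr uint32_t ExceptionsOf(uint32_t payload) { return payload & ~kDoorbellIdMask; }

static_assert(DoorbellOf(PackInterrupt(kEcWaveMemoryViolation, 0x2A)) == 0x2A, "");
static_assert(ExceptionsOf(PackInterrupt(kEcWaveMemoryViolation, 0x2A)) ==
              kEcWaveMemoryViolation, "");
```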
+\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT +\n s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT +\n +\n // Initialize TTMP registers +\n s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT +\n s_cbranch_scc1 .ttmps_initialized +\n s_mov_b32 ttmp4, 0 +\n s_mov_b32 ttmp5, 0 +\n s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT +\n.ttmps_initialized: +\n +\n.exit_trap: +\n // Restore SQ_WAVE_STATUS. +\n s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 +\n s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 +\n s_setreg_b32 hwreg(HW_REG_STATE_PRIV), ttmp12 +\n +\n // Return to original (possibly modified) PC. +\n s_rfe_b64 [ttmp0, ttmp1] +\n +\n.parked: +\n s_trap 0x2 +\n s_branch .parked +\n +\n// Add s_code_end padding so instruction prefetch always has something to read. +\n//.rept (256 - ((. - trap_entry) % 64)) / 4 +\n 64 s_code_end +\n//.endr +\n.endif \n); } // namespace pal diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 91424d7fb..d4fc5b538 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -113,6 +113,12 @@ static constexpr PalDevice supportedPalDevices[] = { {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint1}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint2}, +#if PAL_BUILD_NAVI44 + {12, 0, 0, Pal::GfxIpLevel::GfxIp12, "gfx1200", Pal::AsicRevision::Navi44}, +#endif +#if PAL_BUILD_NAVI48 + {12, 0, 1, Pal::GfxIpLevel::GfxIp12, "gfx1201", Pal::AsicRevision::Navi48}, +#endif }; static std::tuple findIsa(Pal::AsicRevision asicRevision, @@ -510,7 +516,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.errorCorrectionSupport_ = false; if (settings().apuSystem_) { - info_.hostUnifiedMemory_ = true; + info_.hostUnifiedMemory_ = 1; } info_.iommuv2_ = palProp.gpuMemoryProperties.flags.iommuv2Support; @@ -663,12 +669,16 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, // Enable StreamWrite and StreamWait for all devices info_.aqlBarrierValue_ = true; +#if defined(_WIN64) if (amd::IS_HIP) { info_.largeBar_ = false; } else if (heaps[Pal::GpuHeapInvisible].logicalSize == 0) { info_.largeBar_ = true; ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Resizable bar enabled"); } +#else // !_WIN64 + info_.largeBar_ = false; +#endif // _WIN64 } info_.virtualMemoryManagement_ = true; info_.virtualMemAllocGranularity_ = @@ -1286,10 +1296,50 @@ typedef std::unordered_map requestedDevices_t; //! Parses the requested list of devices to be exposed to the user. 
static void parseRequestedDeviceList(const char* requestedDeviceList, - requestedDevices_t& requestedDevices, uint32_t numDevices) { + requestedDevices_t& requestedDevices, uint32_t numDevices, + Pal::IDevice* deviceList[Pal::MaxDevices]) { char* pch = strtok(const_cast(requestedDeviceList), ","); while (pch != nullptr) { bool deviceIdValid = true; + // UUID needs to be specified in the format GPU-, encodes UUID as a 16 chars + char* deviceUuid = strstr(pch, "GPU-"); + // If Uuid is specified, then convert it to index + if (deviceUuid != nullptr) { + for (uint32_t i = 0; i < numDevices; i++) { + Pal::DeviceProperties properties; + // Retrieve device properties + Pal::Result result = deviceList[i]->GetProperties(&properties); + if (result != Pal::Result::Success) { + continue; + } + + // Retrieve uuid + char uuid[17] = {0}; + for (int j = 0; j < 4; j++) { + itoa((reinterpret_cast(&properties.pciProperties.domainNumber))[j], + &uuid[j], 10); + } + for (int j = 0; j < 4; j++) { + itoa((reinterpret_cast(&properties.pciProperties.busNumber))[j], + &uuid[j + 4], 10); + } + for (int j = 0; j < 4; j++) { + itoa((reinterpret_cast(&properties.pciProperties.deviceNumber))[j], + &uuid[j + 8], 10); + } + for (int j = 0; j < 4; j++) { + itoa((reinterpret_cast(&properties.pciProperties.functionNumber))[j], + &uuid[j + 12], 10); + } + + // Convert it to index + if (strcmp(pch + 4, uuid) == 0) { + snprintf(pch, strlen(pch), "%d", i); + break; + } + } + } + int currentDeviceIndex = atoi(pch); // Validate device index. for (size_t i = 0; i < strlen(pch); i++) { @@ -1374,7 +1424,7 @@ bool Device::init() { if (requestedDeviceList[0] != '\0') { useDeviceList = true; - parseRequestedDeviceList(requestedDeviceList, requestedDevices, gNumDevices); + parseRequestedDeviceList(requestedDeviceList, requestedDevices, gNumDevices, &gDeviceList[0]); } bool foundDevice = false; @@ -2439,31 +2489,21 @@ void Device::svmFree(void* ptr) const { // ================================================================================================ void* Device::virtualAlloc(void* addr, size_t size, size_t alignment) { - // create a hidden buffer, which will allocated on the device later - auto mem = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_VA_RANGE_AMD, size, addr); - if (mem == nullptr) { - LogError("failed to new a va range mem object!"); - return nullptr; - } - - constexpr bool kSysMemAlloc = false; - constexpr bool kSkipAlloc = false; - constexpr bool kForceAlloc = true; - // Force the alloc now for VA_Range reservation. 
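Note: both the PAL parser above and the ROCr path later in this change accept either a numeric index or a `GPU-<uuid>` token in the visible-device list and normalize UUID tokens to indices before validation. A simplified, backend-agnostic version of that normalization is sketched below; the UUID lookup is abstracted into a callback because the real code queries PAL or HSA device properties, and the break-on-invalid behavior is an illustrative choice.

```cpp
#include <cstdlib>
#include <functional>
#include <sstream>
#include <string>
#include <vector>

// Turn a comma-separated device list such as "0,GPU-1a2b3c4d5e6f7081,2" into
// numeric ordinals. uuid_to_index resolves "GPU-..." tokens, e.g. by matching
// against per-device UUID strings, and returns -1 when nothing matches.
std::vector<int> ParseVisibleDevices(
    const std::string& list,
    const std::function<int(const std::string&)>& uuid_to_index,
    int num_devices) {
  std::vector<int> ordinals;
  std::stringstream ss(list);
  std::string token;
  while (std::getline(ss, token, ',')) {
    int index = (token.rfind("GPU-", 0) == 0) ? uuid_to_index(token)
                                              : std::atoi(token.c_str());
    if (index < 0 || index >= num_devices) {
      break;  // Stop at the first unresolved or out-of-range entry.
    }
    ordinals.push_back(index);
  }
  return ordinals;
}
```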
- if (!mem->create(nullptr, kSysMemAlloc, kSkipAlloc, kForceAlloc)) { - LogError("failed to create a va range mem object"); - mem->release(); - return nullptr; - } - + amd::Memory* mem = CreateVirtualBuffer(context(), addr, size, -1, true, true); + assert(mem != nullptr); return mem->getSvmPtr(); } // ================================================================================================ void Device::virtualFree(void* addr) { - auto va = amd::MemObjMap::FindVirtualMemObj(addr); - if (nullptr != va) { - va->release(); + auto vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(addr); + if (vaddr_mem_obj == nullptr) { + LogPrintfError("Cannot find any mem_obj for addr: 0x%x \n", addr); + return; + } + + if (!vaddr_mem_obj->getContext().devices()[0]->DestroyVirtualBuffer(vaddr_mem_obj)) { + LogPrintfError("Cannot destroy mem_obj:0x%x for addr: 0x%x \n", vaddr_mem_obj, addr); } } @@ -2712,7 +2752,8 @@ bool Device::importExtSemaphore(void** extSemaphore, const amd::Os::FileDesc& ha (sem_handle_type == amd::ExternalSemaphoreHandleType::TimelineSemaphoreWin32 || sem_handle_type == amd::ExternalSemaphoreHandleType::TimelineSemaphoreFd); palOpenInfo.flags.sharedViaNtHandle = - (sem_handle_type == amd::ExternalSemaphoreHandleType::OpaqueWin32); + (sem_handle_type == amd::ExternalSemaphoreHandleType::OpaqueWin32 || + sem_handle_type == amd::ExternalSemaphoreHandleType::D3D12Fence); Pal::Result result; size_t semaphoreSize = iDev()->GetExternalSharedQueueSemaphoreSize( diff --git a/rocclr/device/pal/palmemory.cpp b/rocclr/device/pal/palmemory.cpp index c90725efe..e423642b0 100644 --- a/rocclr/device/pal/palmemory.cpp +++ b/rocclr/device/pal/palmemory.cpp @@ -200,7 +200,7 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params } if (memRef() != nullptr) { ClPrint(amd::LOG_DEBUG, amd::LOG_RESOURCE, - "Alloc: %llx bytes, ptr[%p-%p], obj[%p-%p]", + "Alloc: %zx bytes, ptr[%llx-%llx], obj[%llx-%llx]", size(), vmAddress(), vmAddress() + size(), diff --git a/rocclr/device/pal/palprintf.cpp b/rocclr/device/pal/palprintf.cpp index d4e0fb6ba..c1fea4aae 100644 --- a/rocclr/device/pal/palprintf.cpp +++ b/rocclr/device/pal/palprintf.cpp @@ -246,7 +246,7 @@ static constexpr size_t ConstStr = 0xffffffff; static constexpr char Separator[] = ",\0"; size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t size, - const uint32_t* argument) const { + const void* argument) const { // Serialize the output to the screen amd::ScopedLock k(dev().lockAsyncOps()); @@ -256,7 +256,7 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t // copiedBytes should be as number of printed chars copiedBytes = 0; //(null) should be printed - if (*argument == 0) { + if (*(reinterpret_cast(argument)) == 0) { amd::Os::printf(fmt.data(), 0); // copiedBytes = strlen("(null)") copiedBytes = 6; @@ -291,11 +291,9 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t case 2: case 4: if (printFloat) { - uint32_t arg = *argument; - if (size == 2) { - auto p = reinterpret_cast(argument); - amd::half2float(*p, &arg); - } + const float fArg = size == 2 ? 
+ amd::half2float(*(reinterpret_cast(argument))) : + *(reinterpret_cast(argument)); static const char* fSpecifiers = "eEfgGa"; std::string fmtF = fmt; size_t posS = fmtF.find_first_of("%"); @@ -303,7 +301,6 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t if (posS != std::string::npos && posE != std::string::npos) { fmtF.replace(posS + 1, posE - posS, "s"); } - float fArg = *(reinterpret_cast(&arg)); float fSign = copysign(1.0, fArg); if (isinf(fArg) && !isnan(fArg)) { if (fSign < 0) { @@ -333,9 +330,13 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t hhFmt.erase(hhFmt.find_first_of("h"), 2); amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); } else if (hlModifier) { - amd::Os::printf(hlFmt.data(), *argument); + amd::Os::printf(hlFmt.data(), size == 2 ? + *(reinterpret_cast(argument)): + *(reinterpret_cast(argument))); } else { - amd::Os::printf(fmt.data(), *argument); + amd::Os::printf(fmt.data(), size == 2 ? + *(reinterpret_cast(argument)): + *(reinterpret_cast(argument))); } } break; @@ -403,13 +404,13 @@ void PrintfDbg::outputDbgBuffer(const device::PrintfInfo& info, const uint32_t* fmt = str.substr(pos, posEnd - pos); fmt.erase(posStart - pos - 1, 1); pos = posStart = posEnd; - outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); + outputArgument(sepStr, false, ConstStr, fmt.data()); continue; } break; } else if (pos < str.length()) { outputArgument(sepStr, false, ConstStr, - reinterpret_cast((str.substr(pos)).data())); + str.substr(pos).data()); } } while (posStart != std::string::npos); @@ -473,11 +474,11 @@ void PrintfDbg::outputDbgBuffer(const device::PrintfInfo& info, const uint32_t* const char* t = reinterpret_cast(s); // Output the vector separator - outputArgument(sepStr, false, ConstStr, reinterpret_cast(Separator)); + outputArgument(sepStr, false, ConstStr, Separator); // Output the next element outputArgument(elementStr, printFloat, elemSize, - reinterpret_cast(&t[k + e * elemSize])); + &t[k + e * elemSize]); } i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) / sizeof(uint32_t); } @@ -486,7 +487,7 @@ void PrintfDbg::outputDbgBuffer(const device::PrintfInfo& info, const uint32_t* if (pos != std::string::npos) { fmt = str.substr(pos, str.size() - pos); - outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); + outputArgument(sepStr, false, ConstStr, fmt.data()); } } diff --git a/rocclr/device/pal/palprintf.hpp b/rocclr/device/pal/palprintf.hpp index 5c7b899c4..add01934f 100644 --- a/rocclr/device/pal/palprintf.hpp +++ b/rocclr/device/pal/palprintf.hpp @@ -118,7 +118,7 @@ class PrintfDbg : public amd::HeapObject { size_t outputArgument(const std::string& fmt, //!< Format strint bool printFloat, //!< Argument is a float value size_t size, //!< Argument's size - const uint32_t* argument //!< Argument's location + const void* argument //!< Argument's location ) const; //! 
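Note: `outputArgument()` now receives an untyped pointer and, for 2-byte floating-point arguments, widens the half value through `amd::half2float` before handing it to printf. For reference, an equivalent standalone binary16-to-binary32 conversion (covering zeros, subnormals, infinities, and NaNs) is sketched below; it is not the runtime's implementation.

```cpp
#include <cstdint>
#include <cstring>

// Widen an IEEE-754 binary16 value to binary32.
float HalfToFloat(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp  = (h >> 10) & 0x1Fu;
  uint32_t mant = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0) {
    if (mant == 0) {
      bits = sign;                                       // Signed zero.
    } else {
      int e = -1;                                        // Subnormal half: renormalize.
      do { ++e; mant <<= 1; } while ((mant & 0x400u) == 0);
      mant &= 0x3FFu;
      bits = sign | ((127u - 15u - e) << 23) | (mant << 13);
    }
  } else if (exp == 0x1Fu) {
    bits = sign | 0x7F800000u | (mant << 13);            // Inf or NaN.
  } else {
    bits = sign | ((exp - 15u + 127u) << 23) | (mant << 13);  // Normal value.
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
```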
Displays the PrintfDbg diff --git a/rocclr/device/pal/palprogram.cpp b/rocclr/device/pal/palprogram.cpp index 665ebf403..29e845b6d 100644 --- a/rocclr/device/pal/palprogram.cpp +++ b/rocclr/device/pal/palprogram.cpp @@ -139,10 +139,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t void Segment::copy(size_t offset, const void* src, size_t size) { if (cpuAccess_ != nullptr) { - amd::Os::fastMemcpy(cpuAddress(offset), src, size); + std::memcpy(cpuAddress(offset), src, size); } else { if (cpuMem_ != nullptr) { - amd::Os::fastMemcpy(cpuAddress(offset), src, size); + std::memcpy(cpuAddress(offset), src, size); } amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer()); VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); @@ -568,7 +568,7 @@ void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_ag bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) { if (program_->isNull()) { - amd::Os::fastMemcpy(reinterpret_cast
(dst) + offset, src, size); + std::memcpy(reinterpret_cast
(dst) + offset, src, size); return true; } Segment* s = reinterpret_cast(dst); diff --git a/rocclr/device/pal/palresource.cpp b/rocclr/device/pal/palresource.cpp index 5f8a86375..dd801157b 100644 --- a/rocclr/device/pal/palresource.cpp +++ b/rocclr/device/pal/palresource.cpp @@ -255,23 +255,26 @@ GpuMemoryReference::~GpuMemoryReference() { if (nullptr == iMem()) { return; } - if (gpu_ == nullptr) { - Device::ScopedLockVgpus lock(device_); - // Release all memory objects on all virtual GPUs - for (uint idx = 1; idx < device_.vgpus().size(); ++idx) { - device_.vgpus()[idx]->releaseMemory(this); + // Memory tracking per queue is disabled if alwaysResident is enabled. Thus, runtime can skip + // updating residency state per every queue + if (!device_.settings().alwaysResident_) { + if (gpu_ == nullptr) { + Device::ScopedLockVgpus lock(device_); + // Release all memory objects on all virtual GPUs + for (uint idx = 1; idx < device_.vgpus().size(); ++idx) { + device_.vgpus()[idx]->releaseMemory(this); + } + } else { + amd::ScopedLock l(gpu_->execution()); + gpu_->releaseMemory(this); + } + if (device_.vgpus().size() != 0) { + assert(device_.vgpus()[0] == device_.xferQueue() && "Wrong transfer queue!"); + // Lock the transfer queue, since it's not handled by ScopedLockVgpus + amd::ScopedLock k(device_.xferMgr().lockXfer()); + device_.vgpus()[0]->releaseMemory(this); } - } else { - amd::ScopedLock l(gpu_->execution()); - gpu_->releaseMemory(this); - } - if (device_.vgpus().size() != 0) { - assert(device_.vgpus()[0] == device_.xferQueue() && "Wrong transfer queue!"); - // Lock the transfer queue, since it's not handled by ScopedLockVgpus - amd::ScopedLock k(device_.xferMgr().lockXfer()); - device_.vgpus()[0]->releaseMemory(this); } - // Destroy PAL object if it's not a suballocation if (cpuAddress_ != nullptr) { iMem()->Unmap(); @@ -1382,13 +1385,12 @@ void Resource::free() { // and resource can be reused on another async queue without a wait on a busy operation if (wait) { if (memRef_->gpu_ == nullptr) { - Device::ScopedLockVgpus lock(dev()); + amd::ScopedLock l(dev().vgpusAccess()); // Release all memory objects on all virtual GPUs for (uint idx = 1; idx < dev().vgpus().size(); ++idx) { dev().vgpus()[idx]->waitForEvent(&events_[idx]); } } else { - amd::ScopedLock l(memRef_->gpu_->execution()); memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]); } } else { @@ -1512,21 +1514,21 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, (size[0] < dev().settings().cpDmaCopySizeMax_)); if (cp_dma) { // Make sure compute is done before CP DMA start - gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::KernelToCopy); } else { gpu.releaseGpuMemoryFence(); gpu.engineID_ = SdmaEngine; + + if (gpu.validateSdmaOverlap(*this, dstResource)) { + // Note: PAL should insert a NOP into the command buffer for synchronization + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::CopyToCopy); + } } // Wait for the resources, since runtime may use async transfers wait(gpu, waitOnBusyEngine); dstResource.wait(gpu, waitOnBusyEngine); - if (gpu.validateSdmaOverlap(*this, dstResource)) { - // Note: PAL should insert a NOP into the command buffer for synchronization - gpu.addBarrier(); - } - Pal::ImageLayout imgLayout = {}; gpu.eventBegin(gpu.engineID_); gpu.queue(gpu.engineID_).addCmdMemRef(memRef()); @@ -1626,7 +1628,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, if 
(cp_dma) { // Make sure CP dma is done - gpu.addBarrier(RgpSqqtBarrierReason::MemDependency); + gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::CopyToKernel); } gpu.eventEnd(gpu.engineID_, event); @@ -1698,7 +1700,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3 dst = static_cast(static_cast(dst) + origin[0]); // Copy memory - amd::Os::fastMemcpy(dst, hostPtr, copySize); + std::memcpy(dst, hostPtr, copySize); } else { size_t dstOffsBase = origin[0] * elementSize_; @@ -1726,7 +1728,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3 // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + std::memcpy((reinterpret_cast
(dst) + dstOffs), (reinterpret_cast(hostPtr) + srcOffs), size[0] * elementSize_); @@ -1768,7 +1770,7 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig src = static_cast(static_cast(src) + origin[0]); // Copy memory - amd::Os::fastMemcpy(hostPtr, src, copySize); + std::memcpy(hostPtr, src, copySize); } else { size_t srcOffsBase = origin[0] * elementSize_; @@ -1796,9 +1798,9 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory - amd::Os::fastMemcpy((reinterpret_cast
(hostPtr) + dstOffs), - (reinterpret_cast(src) + srcOffs), - size[0] * elementSize_); + std::memcpy((reinterpret_cast
(hostPtr) + dstOffs), + (reinterpret_cast(src) + srcOffs), + size[0] * elementSize_); srcOffs += desc().pitch_ * elementSize_; dstOffs += rowPitch; @@ -1937,7 +1939,7 @@ bool Resource::isPersistentDirectMap(bool writeMap) const { if (directMap && desc().tiled_) { // Latest HW does have tiling apertures directMap = false; - } + } if (memoryType() == View) { directMap = viewOwner_->isPersistentDirectMap(writeMap); } diff --git a/rocclr/device/pal/palsettings.cpp b/rocclr/device/pal/palsettings.cpp index c68eaff6f..eadf9c947 100644 --- a/rocclr/device/pal/palsettings.cpp +++ b/rocclr/device/pal/palsettings.cpp @@ -143,7 +143,9 @@ Settings::Settings() { alwaysResident_ = amd::IS_HIP ? true : false; prepinnedMinSize_ = 0; cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki; - useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? true : HIP_FORCE_DEV_KERNARG; + kernel_arg_impl_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) + ? KernelArgImpl::DeviceKernelArgs + : HIP_FORCE_DEV_KERNARG; limit_blit_wg_ = 16; DEBUG_CLR_GRAPH_PACKET_CAPTURE = false; // disable graph performance optimizations for PAL @@ -171,6 +173,12 @@ bool Settings::create(const Pal::DeviceProperties& palProp, amd::Os::getAppPathAndFileName(appName, appPathAndName); switch (palProp.revision) { +#if PAL_BUILD_NAVI48 + case Pal::AsicRevision::Navi48: +#endif +#if PAL_BUILD_NAVI44 + case Pal::AsicRevision::Navi44: +#endif // Fall through for Navi3x ... case Pal::AsicRevision::Navi33: case Pal::AsicRevision::Navi32: @@ -201,11 +209,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp, case Pal::AsicRevision::Navi10_A0: gfx10Plus_ = true; useLightning_ = GPU_ENABLE_LC; - // Force luxmark to use HSAIL for gfx10+ if GPU_ENABLE_LC isn't set in ENV - if (flagIsDefault(GPU_ENABLE_LC) && - (appName == "luxmark.exe" || appName == "luxmark")) { - useLightning_ = false; - } enableWgpMode_ = GPU_ENABLE_WGP_MODE; if (useLightning_) { enableWave32Mode_ = true; diff --git a/rocclr/device/pal/palsettings.hpp b/rocclr/device/pal/palsettings.hpp index 942c5c91b..e3cd2c7f3 100644 --- a/rocclr/device/pal/palsettings.hpp +++ b/rocclr/device/pal/palsettings.hpp @@ -79,8 +79,7 @@ class Settings : public device::Settings { uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 uint disableSdma_ : 1; //!< Disable SDMA support uint alwaysResident_ : 1; //!< Make resources resident at allocation time - uint useDeviceKernelArg_ : 1; //!< Use persistent memory for kernel arguments - uint reserved_ : 9; + uint reserved_ : 10; }; uint value_; }; @@ -139,6 +138,8 @@ class Settings : public device::Settings { //! 
Overrides current settings based on registry/environment void override(); + + using KernelArgImpl = device::Settings::KernelArgImpl; }; /*@}*/ // namespace pal diff --git a/rocclr/device/pal/palsignal.cpp b/rocclr/device/pal/palsignal.cpp index 9b5efaaef..a9e6ae442 100644 --- a/rocclr/device/pal/palsignal.cpp +++ b/rocclr/device/pal/palsignal.cpp @@ -28,7 +28,7 @@ namespace pal { Signal::~Signal() { - dev_->context().svmFree(amdSignal_); + dev_->GlbCtx().svmFree(amdSignal_); if (ws_ == device::Signal::WaitState::Blocked) { #if defined(_WIN32) @@ -50,7 +50,7 @@ bool Signal::Init(const amd::Device& dev, uint64_t init, device::Signal::WaitSta dev_ = static_cast(&dev); ws_ = ws; - void* buffer = dev_->context().svmAlloc(sizeof(amd_signal_t), alignof(amd_signal_t), + void* buffer = dev_->GlbCtx().svmAlloc(sizeof(amd_signal_t), alignof(amd_signal_t), CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS); if (!buffer) { ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index a19d3aa1c..a1aaa86ce 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -408,7 +408,7 @@ bool VirtualGPU::Queue::flush() { submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_]; if (iQueue_->Type() == Pal::QueueTypeCompute) { - if (settings.useDeviceKernelArg_) { + if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) { // If runtime uses device memory for kernel arguments, then perform a CPU read back on // submission. That will make sure NBIO puches all previous CPU write requests through PCIE gpu_.managedBuffer().CpuReadBack(); @@ -955,10 +955,12 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, } // Create buffers for kernel arg management - if (!managedBuffer_.create( - dev().settings().useDeviceKernelArg_ ? Resource::Persistent : Resource::RemoteUSWC)) { + if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ == + KernelArgImpl::DeviceKernelArgs + ? 
Resource::Persistent + : Resource::RemoteUSWC)) { // Try just USWC if persistent memory failed - if (dev().settings().useDeviceKernelArg_) { + if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) { if (!managedBuffer_.create(Resource::RemoteUSWC)) { return false; } @@ -1154,7 +1156,7 @@ VirtualGPU::~VirtualGPU() { "deleting hostcall buffer %p for virtual queue %p", hostcallBuffer_, this); disableHostcalls(hostcallBuffer_); - dev().context().svmFree(hostcallBuffer_); + dev().svmFree(hostcallBuffer_); } } @@ -1567,7 +1569,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) { } if (nullptr == srcMem && nullptr == dstMem) { // both not in svm space - amd::Os::fastMemcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize()); + std::memcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize()); result = true; } else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space Memory* memory = dev().getGpuMemory(dstMem); @@ -2228,11 +2230,15 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr); amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj); vaddr_mem_obj->getUserData().phys_mem_obj = vcmd.memory(); + vcmd.memory()->getUserData().vaddr_mem_obj = vaddr_mem_obj; } else { // assert the vaddr_mem_obj is mapped and needs to be removed assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr); amd::MemObjMap::RemoveMemObj(vcmd.ptr()); - vaddr_mem_obj->getUserData().phys_mem_obj = nullptr; + if (vaddr_mem_obj->getUserData().phys_mem_obj != nullptr) { + vaddr_mem_obj->getUserData().phys_mem_obj->getUserData().vaddr_mem_obj = nullptr; + vaddr_mem_obj->getUserData().phys_mem_obj = nullptr; + } } } profilingEnd(vcmd); @@ -2404,8 +2410,7 @@ void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& static_cast(gpuDefQueue->blitMgr()) .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0, gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); - const static bool FlushL2 = true; - gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2); + gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, BarrierType::FlushL2); // Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler @@ -3020,8 +3025,7 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { engineID_ = static_cast(pGpuMemory->getGpuEvent(*this)->engineId_); // Make sure GPU finished operation and data reached memory before the marker write - static constexpr bool FlushL2 = true; - addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2); + addBarrier(RgpSqqtBarrierReason::SignalSubmit, BarrierType::FlushL2); // Workarounds: We had systems where an extra delay was necessary. { // Flush CB associated with the DGMA buffer @@ -3319,7 +3323,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { return; } // Save the TimeStamp object in the current OCL event - command.setData(ts); + command.data().emplace_back(ts); profileTs_ = ts; state_.profileEnabled_ = true; } @@ -3327,7 +3331,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { void VirtualGPU::profilingEnd(amd::Command& command) { // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(command.data()); + TimeStamp* ts = !command.data().empty() ? 
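Note: `profilingBegin()`/`profilingEnd()` now treat the command's data as a small vector of opaque payloads rather than a single pointer: begin appends the TimeStamp, end reads `back()` if anything is attached, and a timestamp that never recorded a GPU submission is dropped by clearing the vector. A minimal sketch of that attach/read/clear pattern, with a stand-in Command type:

```cpp
#include <vector>

struct TimeStamp { bool valid = false; };

// Stand-in for amd::Command's per-command payload storage.
struct Command {
  std::vector<void*>& data() { return data_; }
 private:
  std::vector<void*> data_;
};

void ProfilingBegin(Command& cmd, TimeStamp* ts) {
  cmd.data().emplace_back(ts);                        // Attach the timestamp to the command.
}

void ProfilingEnd(Command& cmd) {
  TimeStamp* ts = !cmd.data().empty()
      ? static_cast<TimeStamp*>(cmd.data().back())    // Most recently attached payload.
      : nullptr;
  if (ts != nullptr && !ts->valid) {
    // The real code returns the TimeStamp to its cache here.
    cmd.data().clear();                               // Drop the stale payload.
  }
}
```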
reinterpret_cast(command.data().back()) + : nullptr; if (ts != nullptr) { // Check if the command actually did any GPU submission if (ts->isValid()) { @@ -3335,7 +3340,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) { } else { // Destroy the TimeStamp object tsCache_->freeTimeStamp(ts); - command.setData(nullptr); + command.data().clear(); } } } @@ -3364,7 +3369,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai first = cb->head_; while (nullptr != first) { // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); + TimeStamp* ts = !first->data().empty() ? reinterpret_cast(first->data().back()) + : nullptr; if (ts != nullptr) { ts->value(&startTimeStamp, &endTimeStamp); @@ -3381,7 +3387,8 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai first = cb->head_; while (nullptr != first) { // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); + TimeStamp* ts = !first->data().empty() ? reinterpret_cast(first->data().back()) + : nullptr; current = first->getNext(); @@ -3391,7 +3398,7 @@ bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* wai startTimeStamp -= readjustTimeGPU_; // Destroy the TimeStamp object tsCache_->freeTimeStamp(ts); - first->setData(nullptr); + first->data().clear(); } else { // For empty commands start/end is equal to // the end of the last valid command @@ -3765,7 +3772,8 @@ void* VirtualGPU::getOrCreateHostcallBuffer() { auto size = getHostcallBufferSize(numPackets); auto align = getHostcallBufferAlignment(); - hostcallBuffer_ = dev().context().svmAlloc(size, align, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS); + hostcallBuffer_ = dev().svmAlloc(dev().context(), size, align, + CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, nullptr); if (!hostcallBuffer_) { ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to create hostcall buffer"); diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp index 170753426..58bdb307b 100644 --- a/rocclr/device/pal/palvirtual.hpp +++ b/rocclr/device/pal/palvirtual.hpp @@ -66,6 +66,14 @@ struct AqlPacketMgmt : public amd::EmbeddedObject { std::atomic packet_index_; //!< The active packet slot index }; + enum class BarrierType : uint8_t { + KernelToKernel = 0, + KernelToCopy, + CopyToKernel, + CopyToCopy, + FlushL2 +}; + //! Virtual GPU class VirtualGPU : public device::VirtualDevice { public: @@ -478,18 +486,29 @@ class VirtualGPU : public device::VirtualDevice { //! Returns queue, associated with VirtualGPU Queue& queue(EngineType id) const { return *queues_[id]; } - void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, - bool flushL2 = false) const { + void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::MemDependency, + BarrierType type = BarrierType::KernelToKernel) const { Pal::BarrierInfo barrier = {}; barrier.pipePointWaitCount = 1; Pal::HwPipePoint point = Pal::HwPipePostCs; barrier.pPipePoints = &point; barrier.transitionCount = 1; - uint32_t cacheMask = (flushL2) ? 
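Note: `addBarrier()` no longer takes a bare flushL2 flag; the new `BarrierType` enum declared above selects the source and destination cache masks for the PAL transition, as the rewritten body that follows shows. That mapping can be restated in plain C++ like this; the mask values are simple stand-ins for `Pal::CoherShader`, `Pal::CoherCopy`, and `Pal::CoherCpu`, not the real PAL bit definitions.

```cpp
#include <cstdint>
#include <utility>

enum class BarrierType : uint8_t {
  KernelToKernel = 0,
  KernelToCopy,
  CopyToKernel,
  CopyToCopy,
  FlushL2
};

// Stand-ins for the PAL coherency masks used in the diff.
constexpr uint32_t kCoherShader = 0x1;
constexpr uint32_t kCoherCopy = 0x2;
constexpr uint32_t kCoherCpu = 0x4;

// Returns {srcCacheMask, dstCacheMask} for a barrier of the given type.
std::pair<uint32_t, uint32_t> CacheMasksFor(BarrierType type) {
  uint32_t src = kCoherShader;
  uint32_t dst = kCoherShader;
  switch (type) {
    case BarrierType::KernelToCopy: dst = kCoherCopy; break;
    case BarrierType::CopyToKernel: src = kCoherCopy; break;
    case BarrierType::CopyToCopy:   src = dst = kCoherCopy; break;
    case BarrierType::FlushL2:      src = dst = kCoherCopy | kCoherCpu; break;
    case BarrierType::KernelToKernel:
    default: break;                                 // Shader-to-shader is the default.
  }
  return {src, dst};
}
```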
Pal::CoherCopy : Pal::CoherShader; - Pal::BarrierTransition trans = { - cacheMask, - cacheMask, - {nullptr, {{0, 0, 0}, 0, 0, 0}, Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; + Pal::BarrierTransition trans = {}; + trans.srcCacheMask = Pal::CoherShader; + trans.dstCacheMask = Pal::CoherShader; + trans.imageInfo.oldLayout.usages = Pal::LayoutShaderRead; + trans.imageInfo.oldLayout.engines = Pal::LayoutComputeEngine; + trans.imageInfo.newLayout.usages = Pal::LayoutShaderRead; + trans.imageInfo.newLayout.engines = Pal::LayoutComputeEngine; + if (type == BarrierType::KernelToCopy) { + trans.dstCacheMask = Pal::CoherCopy; + } else if (type == BarrierType::CopyToKernel) { + trans.srcCacheMask = Pal::CoherCopy; + } else if (type == BarrierType::CopyToCopy) { + trans.dstCacheMask = trans.srcCacheMask = Pal::CoherCopy; + } else if (type == BarrierType::FlushL2) { + trans.dstCacheMask = trans.srcCacheMask = Pal::CoherCopy | Pal::CoherCpu; + } barrier.pTransitions = &trans; barrier.waitPoint = Pal::HwPipePreCs; barrier.reason = static_cast(reason); @@ -702,6 +721,8 @@ class VirtualGPU : public device::VirtualDevice { MemoryRange sdmaRange_; //!< SDMA memory range for write access void* hostcallBuffer_; //!< Hostcall buffer + + using KernelArgImpl = device::Settings::KernelArgImpl; }; inline void VirtualGPU::logVmMemory(const std::string name, const Memory* memory) { diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 63dfe41ce..ddb458a05 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -33,7 +33,9 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_), completeOperation_(false), context_(nullptr), - sdmaEngineRetainCount_(0) {} + sdmaEngineRetainCount_(0) { + dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_); + } inline void DmaBlitManager::synchronize() const { if (syncOperation_) { @@ -731,6 +733,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, // Check if there a recently used SDMA engine for the stream copyMask = gpu().getLastUsedSdmaEngine(); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask); + copyMask &= (engine == HwQueueEngine::SdmaRead ? + sdmaEngineReadMask_ : sdmaEngineWriteMask_); } if (copyMask == 0) { // Check SDMA engine status @@ -1942,8 +1946,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo bool result = false; // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).IsPersistentDirectMap() && !setup_.disableHostCopyBuffer_) { + if ((setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).IsPersistentDirectMap()) && !setup_.disableHostCopyBuffer_) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata); @@ -2104,37 +2108,7 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern (kpattern_size & 0x1) == 0 ? 
sizeof(uint16_t) : sizeof(uint8_t); // Program kernels arguments for the fill operation cl_mem mem = as_cl(memory.owner()); - if (alignment == 2 * sizeof(uint64_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem); - } else if (alignment == sizeof(uint64_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } else if (alignment == sizeof(uint32_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } else if (alignment == sizeof(uint16_t)) { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } else { - setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr); - setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr); - } + setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem, koffset); const size_t localWorkSize = 256; size_t globalWorkSize = std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size); @@ -2149,18 +2123,18 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern memcpy(constBuf, pattern, kpattern_size); } constexpr bool kDirectVa = true; - setArgument(kernels_[kFillType], 5, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa); + setArgument(kernels_[kFillType], 1, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa); // Adjust the pattern size in the copy type size kpattern_size /= alignment; - setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size); - koffset /= alignment; - setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset); + setArgument(kernels_[kFillType], 2, sizeof(uint32_t), &kpattern_size); + setArgument(kernels_[kFillType], 3, sizeof(alignment), &alignment); + // Calculate max id - kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment; - setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size); + kfill_size = memory.virtualAddress() + koffset + kfill_size * kpattern_size * alignment; + setArgument(kernels_[kFillType], 4, sizeof(kfill_size), &kfill_size); uint32_t next_chunk = globalWorkSize * kpattern_size; - setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk); + setArgument(kernels_[kFillType], 5, sizeof(uint32_t), &next_chunk); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); @@ -2347,31 +2321,27 @@ 
bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds // Program kernels arguments for the blit operation cl_mem mem = as_cl(srcMemory.owner()); - setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, 0, &srcMemory); - mem = as_cl(dstMemory.owner()); - setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, 0, &dstMemory); - // Program source origin uint64_t srcOffset = srcOrigin[0]; - setArgument(kernels_[kBlitType], 2, sizeof(srcOffset), &srcOffset); - + setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, srcOffset, &srcMemory); + mem = as_cl(dstMemory.owner()); // Program destinaiton origin uint64_t dstOffset = dstOrigin[0]; - setArgument(kernels_[kBlitType], 3, sizeof(dstOffset), &dstOffset); + setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, dstOffset, &dstMemory); uint64_t copySize = sizeIn[0]; - setArgument(kernels_[kBlitType], 4, sizeof(copySize), ©Size); + setArgument(kernels_[kBlitType], 2, sizeof(copySize), ©Size); - setArgument(kernels_[kBlitType], 5, sizeof(remainder), &remainder); - setArgument(kernels_[kBlitType], 6, sizeof(aligned_size), &aligned_size); + setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder); + setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size); // End pointer is the aligned copy size and destination offset uint64_t end_ptr = dstMemory.virtualAddress() + dstOffset + sizeIn[0] - remainder; - setArgument(kernels_[kBlitType], 7, sizeof(end_ptr), &end_ptr); + setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr); uint32_t next_chunk = globalWorkSize; - setArgument(kernels_[kBlitType], 8, sizeof(next_chunk), &next_chunk); + setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize); diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp index 0533a11f5..f8b4c1a28 100644 --- a/rocclr/device/rocm/rocblit.hpp +++ b/rocclr/device/rocm/rocblit.hpp @@ -238,6 +238,8 @@ class DmaBlitManager : public device::HostBlitManager { amd::Context* context_; //!< A dummy context mutable size_t sdmaEngineRetainCount_; //!< Keeps track of memcopies to either get the last //!< used SDMA engine or fetch the new mask + uint32_t sdmaEngineReadMask_; //!< SDMA Engine Read Mask + uint32_t sdmaEngineWriteMask_; //!< SDMA Engine Write Mask private: //! 
Disable copy constructor diff --git a/rocclr/device/rocm/roccounters.cpp b/rocclr/device/rocm/roccounters.cpp index c10ae9896..4b7b6c4ef 100644 --- a/rocclr/device/rocm/roccounters.cpp +++ b/rocclr/device/rocm/roccounters.cpp @@ -431,14 +431,6 @@ PerfCounter::PerfCounter(const Device& device, //!< A ROC device object // these block indices are valid for the SI (Gfx8) & Gfx9 devices switch (roc_device_.isa().versionMajor()) { - case (8): - gfxVersion_ = ROC_GFX8; - if (blockIndex < viBlockIdOrcaToRocr.size()) { - auto p = viBlockIdOrcaToRocr[blockIndex]; - event_.block_name = std::get<0>(p); - event_.block_index = std::get<1>(p); - } - break; case (9): gfxVersion_ = ROC_GFX9; if (blockIndex < gfx9BlockIdOrcaToRocr.size()) { diff --git a/rocclr/device/rocm/roccounters.hpp b/rocclr/device/rocm/roccounters.hpp index 494d3e07b..9d7ca002a 100644 --- a/rocclr/device/rocm/roccounters.hpp +++ b/rocclr/device/rocm/roccounters.hpp @@ -36,7 +36,6 @@ class PerfCounter : public device::PerfCounter { public: enum { ROC_UNSUPPORTED = 0, - ROC_GFX8, ROC_GFX9, ROC_GFX10 }; diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 5eee23e65..757a113fe 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -141,7 +141,7 @@ bool NullDevice::create(const amd::Isa &isa) { roc::Settings* hsaSettings = new roc::Settings(); settings_ = hsaSettings; if (!hsaSettings || - !hsaSettings->create(false, isa.versionMajor(), isa.versionMinor(), isa.versionStepping(), + !hsaSettings->create(false, isa, isa.xnack() == amd::Isa::Feature::Enabled)) { LogPrintfError("Error creating settings for offline HSA device %s", isa.targetId()); return false; @@ -259,7 +259,7 @@ Device::~Device() { p2p_stage_ = nullptr; } if (nullptr != mg_sync_) { - amd::SvmBuffer::free(GlbCtx(), mg_sync_); + GlbCtx().svmFree(mg_sync_); mg_sync_ = nullptr; } if (glb_ctx_ != nullptr) { @@ -503,11 +503,28 @@ bool Device::init() { if (end == std::string::npos) { end = ordinals.size(); } - std::string strIndex = ordinals.substr(pos, end - pos); - int index = atoi(strIndex.c_str()); + std::string str_id = ordinals.substr(pos, end - pos); + // If Uuid is specified, then convert it to index + // Uuid is an Ascii string with a maximum of 21 chars including NULL + // The string value is in the format GPU-, encodes UUID as a 16 chars hex + if (str_id.find("GPU-") != std::string::npos) { + for (int i = 0; i < gpu_agents_.size(); i++) { + auto agent = gpu_agents_[i]; + char unique_id[32] = {0}; + if (HSA_STATUS_SUCCESS == + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_UUID), + unique_id)) { + if (std::string(unique_id).find(str_id) != std::string::npos) { + str_id = std::to_string(i); + break; + } + } + } + } + int index = atoi(str_id.c_str()); if (index < 0 || static_cast(index) >= gpu_agents_.size() || - strIndex != std::to_string(index)) { + str_id != std::to_string(index)) { deviceIdValid = false; } @@ -602,9 +619,8 @@ bool Device::init() { // Allocate mgpu sync buffer for cooperative launches if (amd::IS_HIP) { - mg_sync_ = reinterpret_cast
<address>(amd::SvmBuffer::malloc( - *glb_ctx_, (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS), - kMGInfoSizePerDevice * devices.size(), kMGInfoSizePerDevice)); + mg_sync_ = reinterpret_cast<address>
(glb_ctx_->svmAlloc(kMGInfoSizePerDevice * devices.size(), + kMGInfoSizePerDevice, (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS))); if (mg_sync_ == nullptr) { LogError("mgpu sync buffer alloc failed"); return false; @@ -718,23 +734,17 @@ bool Device::create() { info_.hdpMemFlushCntl = hdpInfo.HDP_MEM_FLUSH_CNTL; info_.hdpRegFlushCntl = hdpInfo.HDP_REG_FLUSH_CNTL; - - bool device_kernel_args = true; - if (!isXgmi_ && ((info_.hdpMemFlushCntl == nullptr) || (info_.hdpRegFlushCntl == nullptr))) { - LogWarning("Unable to determine HDP flush register address. " - "Device kernel arguments are not supported"); - device_kernel_args = false; - } + bool hasValidHDPFlush = + (info_.hdpMemFlushCntl != nullptr) && (info_.hdpRegFlushCntl != nullptr); // Create HSA settings assert(!settings_); roc::Settings* hsaSettings = new roc::Settings(); settings_ = hsaSettings; if (!hsaSettings || - !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), isa->versionMajor(), - isa->versionMinor(), isa->versionStepping(), + !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa, isa->xnack() == amd::Isa::Feature::Enabled, - coop_groups, device_kernel_args)) { + coop_groups, isXgmi_, hasValidHDPFlush)) { LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name, pciDeviceId_); return false; @@ -1332,6 +1342,19 @@ bool Device::populateOCLDeviceConstants() { info_.maxWorkItemDimensions_ = 3; + uint8_t memory_properties[8]; + // Get the memory property from ROCr. + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_, + (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES, + memory_properties)) { + LogError("HSA_AGENT_INFO_AMD_MEMORY_PROPERTIES query failed"); + } + + // Check if the device is APU + if (hsa_flag_isset64(memory_properties, HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU)) { + info_.hostUnifiedMemory_ = 1; + } + if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { size_t global_segment_size = 0; if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, @@ -1350,8 +1373,9 @@ bool Device::populateOCLDeviceConstants() { GPU_SINGLE_ALLOC_PERCENT = 75; } } - // Limit gpu single allocation percentage on MI300 - if ((isa().versionMajor() == 9) && (isa().versionMinor() == 4)) { + // Limit gpu single allocation percentage for gfx940 + if ((isa().versionMajor() == 9) && (isa().versionMinor() == 4) && + (isa().versionStepping() == 0) && (info_.hostUnifiedMemory_ == 1)) { if (gpu_agents_.size() == 1 || p2p_agents_.size() == 0) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { GPU_SINGLE_ALLOC_PERCENT = 60; @@ -1433,7 +1457,7 @@ bool Device::populateOCLDeviceConstants() { if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory, // base-profile = NUMA based non-coherent memory - info_.hostUnifiedMemory_ = true; + info_.hostUnifiedMemory_ = 1; info_.iommuv2_ = true; } info_.memBaseAddrAlign_ = @@ -1847,19 +1871,6 @@ bool Device::populateOCLDeviceConstants() { std::numeric_limits::max(); // gfx10+ does not share SGPRs between waves } - uint8_t memory_properties[8]; - // Get the memory property from ROCr. 
- if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_, - (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES, - memory_properties)) { - LogError("HSA_AGENT_INFO_AMD_MEMORY_PROPERTIES query failed"); - } - - // Check if the device is APU - if (hsa_flag_isset64(memory_properties, HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU)) { - info_.hostUnifiedMemory_ = 1; - } - return true; } @@ -2274,7 +2285,7 @@ bool Device::deviceAllowAccess(void* ptr) const { hsa_status_t stat = hsa_amd_agents_allow_access(p2pAgents().size(), p2pAgents().data(), nullptr, ptr); if (stat != HSA_STATUS_SUCCESS) { - LogError("Allow p2p access failed - hsa_amd_agents_allow_access"); + LogPrintfError("Allow p2p access failed - hsa_amd_agents_allow_access with err %d", stat); return false; } } @@ -2290,7 +2301,7 @@ bool Device::allowPeerAccess(device::Memory* memory) const { hsa_agent_t agent = getBackendDevice(); hsa_status_t stat = hsa_amd_agents_allow_access(1, &agent, nullptr, ptr); if (stat != HSA_STATUS_SUCCESS) { - LogError("Allow p2p access failed - hsa_amd_agents_allow_access"); + LogPrintfError("Allow p2p access failed - hsa_amd_agents_allow_access with err: %d", stat); return false; } } @@ -2320,7 +2331,8 @@ void Device::deviceVmemRelease(uint64_t mem_handle) const { } } -void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const { +void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain, + bool contiguous) const { const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain) ? gpu_ext_fine_grained_segment_ : (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_; @@ -2330,6 +2342,11 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain return nullptr; } + uint32_t hsa_mem_flags = 0; + if (contiguous) { + hsa_mem_flags = HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG; + } + void* ptr = nullptr; hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa device memory %p, size 0x%zx", ptr, size); @@ -2400,8 +2417,8 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return nullptr; } - if (mem->getSvmPtr() != nullptr || mem->getMemFlags() & ROCCLR_MEM_PHYMEM) { - // add the information to context so that we can use it later. + // add the information to context so that we can use it later. + if (mem->getSvmPtr() != nullptr) { amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem); } svmPtr = mem->getSvmPtr(); @@ -2430,22 +2447,12 @@ void* Device::virtualAlloc(void* req_addr, size_t size, size_t alignment) { return nullptr; } - // This mem->create() does not create an actual memory but stores the memory info with given vptr. - auto mem = new (context()) amd::Buffer(context(), CL_MEM_VA_RANGE_AMD, size, vptr); + constexpr bool kParent = true; + amd::Memory* mem = CreateVirtualBuffer(context(), vptr, size, -1, kParent); if (mem == nullptr) { - LogError("failed to new a va range mem object!"); - return nullptr; + LogPrintfError("Cannot create Virtual Buffer for vptr: %p of size: %u", vptr, size); } - if (!mem->create(nullptr, false)) { - LogError("failed to create a va range mem object"); - mem->release(); - return nullptr; - } - - // Assert to make sure that amd::Memory object has set the right ptr. 
- guarantee(vptr == mem->getSvmPtr(), "amd::Memory object does not have the right ptr"); - return mem->getSvmPtr(); } @@ -2455,6 +2462,8 @@ void Device::virtualFree(void* addr) { LogPrintfError("Cannot find the Virtual MemObj entry for this addr 0x%x", addr); } + memObj->getContext().devices()[0]->DestroyVirtualBuffer(memObj); + hsa_status_t hsa_status = hsa_amd_vmem_address_free(memObj->getSvmPtr(), memObj->getSize()); if (hsa_status != HSA_STATUS_SUCCESS) { LogPrintfError("Failed hsa_amd_vmem_address_free. Failed with status:%d \n", hsa_status); @@ -2498,6 +2507,50 @@ bool Device::GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) { return true; } +// ================================================================================================ +bool Device::ExportShareableVMMHandle(uint64_t hsa_handle, int flags, void* shareableHandle) { + hsa_status_t hsa_status = HSA_STATUS_SUCCESS; + hsa_amd_vmem_alloc_handle_t hsa_vmem_handle {}; + + if (hsa_handle == 0) { + LogError("HSA Handle is not valid"); + return false; + } + + int dmabuf_fd = 0; + hsa_vmem_handle.handle = hsa_handle; + if ((hsa_status = hsa_amd_vmem_export_shareable_handle(&dmabuf_fd, + hsa_vmem_handle, flags)) != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_vmem_export_shareable_handle with status: %d \n", hsa_status); + return false; + } + + *(reinterpret_cast(shareableHandle)) = dmabuf_fd; + + return true; +} + +// ================================================================================================ +bool Device::ImportShareableVMMHandle(void* osHandle, uint64_t* hsa_handle_ptr) const { + hsa_status_t hsa_status = HSA_STATUS_SUCCESS; + hsa_amd_vmem_alloc_handle_t hsa_vmem_handle {}; + + if (hsa_handle_ptr == nullptr) { + LogError("HSA Handle ptr is null"); + return false; + } + + int dmabuf_fd = *(reinterpret_cast(osHandle)); + if ((hsa_status = hsa_amd_vmem_import_shareable_handle(dmabuf_fd, &hsa_vmem_handle)) + != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_amd_vmem_import_shareable_handle with status: %d \n", hsa_status); + return false; + } + + *hsa_handle_ptr = hsa_vmem_handle.handle; + return true; +} + // ================================================================================================ bool Device::SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryAdvice advice, bool first_alloc, bool use_cpu) const { @@ -3469,6 +3522,12 @@ uint32_t Device::fetchSDMAMask(const device::BlitManager* handle, bool readEngin return (readEngine ? 
maxSdmaReadMask_ : maxSdmaWriteMask_) & engine; } +// ================================================================================================ +void Device::getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const { + *readMask = maxSdmaReadMask_; + *writeMask = maxSdmaWriteMask_; +} + // ================================================================================================ void Device::resetSDMAMask(const device::BlitManager* handle) const { amd::ScopedLock lock(vgpusAccess()); diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index eef891f2e..6a4597462 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -221,6 +221,7 @@ class NullDevice : public amd::Device { ShouldNotReachHere(); return; } + void* virtualAlloc(void* req_addr, size_t size, size_t alignment) override { ShouldNotReachHere(); return nullptr; @@ -452,7 +453,8 @@ class Device : public NullDevice { bool allowPeerAccess(device::Memory* memory) const; void deviceVmemRelease(uint64_t mem_handle) const; uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const; - void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const; + void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false, + bool contiguous = false) const; void memFree(void* ptr, size_t size) const; @@ -472,6 +474,10 @@ class Device : public NullDevice { virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags); virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr); + virtual bool ExportShareableVMMHandle(uint64_t hsa_handle, int flags, void* shareableHandle); + + virtual bool ImportShareableVMMHandle(void* osHandle, uint64_t* hsa_handle_ptr) const; + virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput); @@ -583,7 +589,8 @@ class Device : public NullDevice { void HiddenHeapAlloc(const VirtualGPU& gpu); uint32_t fetchSDMAMask(const device::BlitManager* handle, bool readEngine = true) const; - void resetSDMAMask(const device::BlitManager* handle) const ; + void resetSDMAMask(const device::BlitManager* handle) const; + void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const; bool isXgmi() const { return isXgmi_; } private: @@ -612,7 +619,7 @@ class Device : public NullDevice { mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls hsa_agent_t bkendDevice_; uint32_t pciDeviceId_; - hsa_agent_t* p2p_agents_list_; + hsa_agent_t* p2p_agents_list_ = nullptr; hsa_profile_t agent_profile_; hsa_amd_memory_pool_t group_segment_; hsa_amd_memory_pool_t system_segment_; diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp index 611326234..8fba64ebc 100644 --- a/rocclr/device/rocm/rocmemory.cpp +++ b/rocclr/device/rocm/rocmemory.cpp @@ -769,13 +769,27 @@ bool Buffer::create(bool alloc_local) { cl_mem_flags memFlags = owner()->getMemFlags(); if (memFlags & ROCCLR_MEM_PHYMEM) { - // If this is physical memory request, then get an handle and store it in user data - owner()->getUserData().hsa_handle = dev().deviceVmemAlloc(owner()->getSize(), 0); + if (memFlags & ROCCLR_MEM_INTERPROCESS) { + int dmabuf_fd = *(reinterpret_cast(owner()->getSvmPtr())); + // if interprocess flag is set, then the memory is importable. 
+ if (!dev().ImportShareableVMMHandle(owner()->getSvmPtr(), + &owner()->getUserData().hsa_handle)) { + LogPrintfError("Importing Shareable Memory failed with os_handle: 0x%x \n", + owner()->getSvmPtr()); + return false; + } + } else { + // If this is physical memory request, then get an handle and store it in user data + owner()->getUserData().hsa_handle = dev().deviceVmemAlloc(owner()->getSize(), 0); + } + if (owner()->getUserData().hsa_handle == 0) { LogError("HSA Opaque Handle returned was null"); return false; } - deviceMemory_ = reinterpret_cast(amd::Memory::MemoryType::kPhyMemHandlePtr); + + owner()->setSvmPtr(reinterpret_cast(owner()->getUserData().hsa_handle)); + return true; } @@ -842,7 +856,8 @@ bool Buffer::create(bool alloc_local) { } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0, - (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0); + (memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0, + (memFlags & ROCCLR_MEM_HSA_CONTIGUOUS) != 0); } owner()->setSvmPtr(deviceMemory_); } else { diff --git a/rocclr/device/rocm/rocprintf.cpp b/rocclr/device/rocm/rocprintf.cpp index 224e106ce..efa665d32 100644 --- a/rocclr/device/rocm/rocprintf.cpp +++ b/rocclr/device/rocm/rocprintf.cpp @@ -135,17 +135,16 @@ static constexpr size_t ConstStr = 0xffffffff; static constexpr char Separator[] = ",\0"; size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t size, - const uint32_t* argument) const { + const void* argument) const { // Serialize the output to the screen // amd::ScopedLock k(dev().lockAsyncOps()); - size_t copiedBytes = size; // Print the string argument, using standard PrintfDbg() if (checkString(fmt.c_str())) { // copiedBytes should be as number of printed chars copiedBytes = 0; //(null) should be printed - if (*argument == 0) { + if (*(reinterpret_cast(argument)) == 0) { amd::Os::printf(fmt.data(), 0); // copiedBytes = strlen("(null)") copiedBytes = 6; @@ -180,11 +179,9 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t case 2: case 4: if (printFloat) { - uint32_t arg = *argument; - if (size == 2) { - auto p = reinterpret_cast(argument); - amd::half2float(*p, &arg); - } + const float fArg = size == 2 ? + amd::half2float(*(reinterpret_cast(argument))) : + *(reinterpret_cast(argument)); static const char* fSpecifiers = "eEfgGa"; std::string fmtF = fmt; size_t posS = fmtF.find_first_of("%"); @@ -192,7 +189,6 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t if (posS != std::string::npos && posE != std::string::npos) { fmtF.replace(posS + 1, posE - posS, "s"); } - float fArg = *(reinterpret_cast(&arg)); float fSign = copysign(1.0, fArg); if (std::isinf(fArg) && !std::isnan(fArg)) { if (fSign < 0) { @@ -223,9 +219,13 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t hhFmt.erase(hhFmt.find_first_of("h"), 2); amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); } else if (hlModifier) { - amd::Os::printf(hlFmt.data(), *argument); + amd::Os::printf(hlFmt.data(), size == 2 ? + *(reinterpret_cast(argument)): + *(reinterpret_cast(argument))); } else { - amd::Os::printf(fmt.data(), *argument); + amd::Os::printf(fmt.data(), size == 2 ? 
+ *(reinterpret_cast(argument)): + *(reinterpret_cast(argument))); } } break; @@ -295,13 +295,13 @@ void PrintfDbg::outputDbgBuffer(const device::PrintfInfo& info, const uint32_t* fmt = str.substr(pos, posEnd - pos); fmt.erase(posStart - pos - 1, 1); pos = posStart = posEnd; - outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); + outputArgument(sepStr, false, ConstStr, fmt.data()); continue; } break; } else if (pos < str.length()) { outputArgument(sepStr, false, ConstStr, - reinterpret_cast((str.substr(pos)).data())); + str.substr(pos).data()); } } while (posStart != std::string::npos); diff --git a/rocclr/device/rocm/rocprintf.hpp b/rocclr/device/rocm/rocprintf.hpp index 2945ee835..64d9f902d 100644 --- a/rocclr/device/rocm/rocprintf.hpp +++ b/rocclr/device/rocm/rocprintf.hpp @@ -103,7 +103,7 @@ class PrintfDbg : public amd::HeapObject { size_t outputArgument(const std::string& fmt, //!< Format strint bool printFloat, //!< Argument is a float value size_t size, //!< Argument's size - const uint32_t* argument //!< Argument's location + const void* argument //!< Argument's location ) const; //! Displays the PrintfDbg diff --git a/rocclr/device/rocm/rocsettings.cpp b/rocclr/device/rocm/rocsettings.cpp index a1688794a..c3c556633 100644 --- a/rocclr/device/rocm/rocsettings.cpp +++ b/rocclr/device/rocm/rocsettings.cpp @@ -95,15 +95,20 @@ Settings::Settings() { fgs_kernel_arg_ = false; barrier_value_packet_ = false; - device_kernel_args_ = false; + kernel_arg_impl_ = KernelArgImpl::HostKernelArgs; gwsInitSupported_ = true; limit_blit_wg_ = 16; } // ================================================================================================ -bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, - uint32_t gfxStepping, bool enableXNACK, bool coop_groups, - bool device_kernel_args) { +bool Settings::create(bool fullProfile, const amd::Isa& isa, + bool enableXNACK, bool coop_groups, + bool isXgmi, bool hasValidHDPFlush) { + + uint32_t gfxipMajor = isa.versionMajor(); + uint32_t gfxipMinor = isa.versionMinor(); + uint32_t gfxStepping = isa.versionStepping(); + customHostAllocator_ = false; if (fullProfile) { @@ -166,11 +171,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor barrier_value_packet_ = true; } - // Enable device kernel args for MI300* for now - if (gfxipMajor == 9 && gfxipMinor == 4 && - (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) { - device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args; - } + setKernelArgImpl(isa, isXgmi, hasValidHDPFlush); if (gfxipMajor >= 10) { enableWave32Mode_ = true; @@ -234,8 +235,47 @@ void Settings::override() { fgs_kernel_arg_ = ROC_USE_FGS_KERNARG; } + if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) { + kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT; + } +} + +// ================================================================================================ +void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush) { + + const uint32_t gfxipMajor = isa.versionMajor(); + const uint32_t gfxipMinor = isa.versionMinor(); + const uint32_t gfxStepping = isa.versionStepping(); + + const bool isMI300 = gfxipMajor == 9 && gfxipMinor == 4 && + (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2); + const bool isMI200 = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10); + + auto kernelArgImpl = KernelArgImpl::HostKernelArgs; + + if (isXgmi) { + // The XGMI-connected path does not require the manual memory ordering + 
// workarounds that the PCIe connected path requires + kernelArgImpl = KernelArgImpl::DeviceKernelArgs; + } else if (hasValidHDPFlush) { + // If the HDP flush register is valid implement the HDP flush to MMIO + kernelArgImpl = KernelArgImpl::DeviceKernelArgsHDP; + } else if (isMI300 || isMI200) { + // Implement the kernel argument readback workaround + // (write all args -> sfence -> write last byte -> mfence -> read last byte) + // It works only on MI200 and MI300 because of the strict guarantee on + // ordering of stores in those ASICS + kernelArgImpl = KernelArgImpl::DeviceKernelArgsReadback; + } + + // Enable device kernel args for MI300* for now + if (isMI300) { + kernel_arg_impl_ = kernelArgImpl; + kernel_arg_opt_ = true; + } + if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) { - device_kernel_args_ = HIP_FORCE_DEV_KERNARG; + kernel_arg_impl_ = kernelArgImpl & (HIP_FORCE_DEV_KERNARG ? 0xF : 0x0); } } } // namespace roc diff --git a/rocclr/device/rocm/rocsettings.hpp b/rocclr/device/rocm/rocsettings.hpp index 4b6e384c4..7a250bb7b 100644 --- a/rocclr/device/rocm/rocsettings.hpp +++ b/rocclr/device/rocm/rocsettings.hpp @@ -52,8 +52,7 @@ class Settings : public device::Settings { uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system uint fgs_kernel_arg_ : 1; //!< Use fine grain kernel arg segment uint barrier_value_packet_ : 1; //!< Barrier value packet functionality - uint device_kernel_args_ : 1; //!< Allocate kernel args in device memory - uint reserved_ : 20; + uint reserved_ : 21; }; uint value_; }; @@ -83,9 +82,9 @@ class Settings : public device::Settings { Settings(); //! Creates settings - bool create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, uint32_t gfxStepping, - bool enableXNACK, bool coop_groups = false, - bool device_kernel_args = true); + bool create(bool fullProfile, const amd::Isa &isa, bool enableXNACK, + bool coop_groups = false, bool isXgmi = false, + bool hasValidHDPFlush = true); private: //! Disable copy constructor @@ -96,6 +95,10 @@ class Settings : public device::Settings { //! Overrides current settings based on registry/environment void override(); + + //! Determine how kernel arguments should be implemented given ASIC (host + //! 
memory, device memory, device memory with memory ordering workaround) + void setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush); }; /*@}*/} // namespace roc diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index fd9bb7faa..335c511d0 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -36,6 +36,7 @@ #include "os/os.hpp" #include "hsa/amd_hsa_kernel_code.h" #include "hsa/amd_hsa_queue.h" +#include "hsa/amd_hsa_signal.h" #include #include @@ -143,6 +144,8 @@ void Timestamp::checkGpuTime() { if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ || command().type() == CL_COMMAND_TASK) { hsa_amd_profiling_dispatch_time_t time = {}; + amd_signal_t* amdSignal = reinterpret_cast(it->signal_.handle); + if (it->engine_ == HwQueueEngine::Compute) { hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time); } else { @@ -159,9 +162,10 @@ void Timestamp::checkGpuTime() { static_cast(command()).addTimestamps(time.start, time.end); } - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, " - "end = %ld time taken= %ld ns", it->signal_.handle, time.start, time.end, - time.end - time.start); + ClPrint(amd::LOG_INFO, amd::LOG_TS, "Signal = (0x%lx), Translated start/end = %ld / %ld, " + "Elapsed = %ld ns, ticks start/end = %ld / %ld, Ticks elapsed = %ld", it->signal_.handle, + time.start, time.end, time.end - time.start, amdSignal->start_ts, amdSignal->end_ts, + amdSignal->end_ts - amdSignal->start_ts); } it->flags_.done_ = true; } @@ -194,23 +198,25 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) { head = ts->command().GetBatchHead(); } while (head != nullptr) { - if (head->data() != nullptr) { - Timestamp* headTs = reinterpret_cast(head->data()); - ts->setParsedCommand(head); - for (auto it : headTs->Signals()) { - hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0; - if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) { - hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_, - HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, - &HsaAmdSignalHandler, ts); - if (HSA_STATUS_SUCCESS != result) { - LogError("hsa_amd_signal_async_handler() failed to requeue the handler!"); - } else { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p)," - "handle(0x%lx)", static_cast(val), headTs, - headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0); + if (!head->data().empty()) { + for (auto i = 0; i < head->data().size(); i++) { + Timestamp* headTs = reinterpret_cast(head->data()[i]); + ts->setParsedCommand(head); + for (auto it : headTs->Signals()) { + hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0; + if (int64_t val = hsa_signal_load_relaxed(it->signal_) > complete_val) { + hsa_status_t result = hsa_amd_signal_async_handler(headTs->Signals()[0]->signal_, + HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, + &HsaAmdSignalHandler, ts); + if (HSA_STATUS_SUCCESS != result) { + LogError("hsa_amd_signal_async_handler() failed to requeue the handler!"); + } else { + ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Requeue handler : value(%d), timestamp(%p)," + "handle(0x%lx)", static_cast(val), headTs, + headTs->HwProfiling() ? 
headTs->Signals()[0]->signal_.handle : 0); + } + return false; } - return false; } } } @@ -349,6 +355,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) { // ================================================================================================ VirtualGPU::HwQueueTracker::~HwQueueTracker() { for (auto& signal: signal_list_) { + CpuWaitForSignal(signal); signal->release(); } } @@ -356,9 +363,7 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() { // ================================================================================================ bool VirtualGPU::HwQueueTracker::Create() { uint kSignalListSize = ROC_SIGNAL_POOL_SIZE; - if (activity_prof::IsEnabled(OP_ID_DISPATCH) || gpu_.profiling_) { - kSignalListSize = !flagIsDefault(ROC_SIGNAL_POOL_SIZE) ? ROC_SIGNAL_POOL_SIZE : 4 * Ki; - } + signal_list_.resize(kSignalListSize); hsa_agent_t agent = gpu_.gpu_device(); @@ -832,13 +837,13 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1 // ================================================================================================ template bool VirtualGPU::dispatchGenericAqlPacket( - AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size) { + AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking) { const uint32_t queueSize = gpu_queue_->size; const uint32_t queueMask = queueSize - 1; const uint32_t sw_queue_size = queueMask; // Check for queue full and wait if needed. - uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size); + uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1); uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); if (addSystemScope_) { header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE | @@ -887,43 +892,38 @@ bool VirtualGPU::dispatchGenericAqlPacket( blocking = true; } - // Insert packet(s) - // NOTE: need multiple packets to dispatch the performance counter - // packet blob of the legacy devices (gfx8) - for (uint i = 0; i < size; i++, index++, packet++) { - AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; - *aql_loc = *packet; - if (header != 0) { - packet_store_release(reinterpret_cast(aql_loc), header, rest); - } - ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, - "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = " - "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " - "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], private_seg_size=%zu, " - "group_seg_size=%zu, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx " - "rptr=%u, wptr=%u", - gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, - extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE), - extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER), - extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), - extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, - HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), - rest, reinterpret_cast(packet)->grid_size_x, - reinterpret_cast(packet)->grid_size_y, - reinterpret_cast(packet)->grid_size_z, - reinterpret_cast(packet)->workgroup_size_x, - reinterpret_cast(packet)->workgroup_size_y, - reinterpret_cast(packet)->workgroup_size_z, - reinterpret_cast(packet)->private_segment_size, - reinterpret_cast(packet)->group_segment_size, - reinterpret_cast(packet)->kernel_object, - reinterpret_cast(packet)->kernarg_address, - 
reinterpret_cast(packet)->completion_signal, read, - index); - } - - hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1); + AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; + *aql_loc = *packet; + if (header != 0) { + packet_store_release(reinterpret_cast(aql_loc), header, rest); + } + ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, + "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = " + "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], private_seg_size=%zu, " + "group_seg_size=%zu, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx " + "rptr=%u, wptr=%u", + gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, + extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE), + extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + rest, reinterpret_cast(packet)->grid_size_x, + reinterpret_cast(packet)->grid_size_y, + reinterpret_cast(packet)->grid_size_z, + reinterpret_cast(packet)->workgroup_size_x, + reinterpret_cast(packet)->workgroup_size_y, + reinterpret_cast(packet)->workgroup_size_z, + reinterpret_cast(packet)->private_segment_size, + reinterpret_cast(packet)->group_segment_size, + reinterpret_cast(packet)->kernel_object, + reinterpret_cast(packet)->kernarg_address, + reinterpret_cast(packet)->completion_signal, read, + index); + + hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index); // Mark the flag indicating if a dispatch is outstanding. // We are not waiting after every dispatch. 
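For context, the single-packet dispatch path above follows the usual HSA user-mode queue protocol: reserve a write index, copy the packet body into the ring, publish the header with release semantics, and ring the doorbell with that same index. A minimal standalone sketch of that protocol, assuming an already-created hsa_queue_t and an illustrative helper name (enqueue_dispatch), might look like:

```cpp
#include <hsa/hsa.h>
#include <cstdint>

// Minimal sketch: submit one kernel dispatch packet to an HSA user-mode queue.
// 'pkt' is assumed fully populated except for its header field, which stays
// HSA_PACKET_TYPE_INVALID until it is published below.
static void enqueue_dispatch(hsa_queue_t* queue, hsa_kernel_dispatch_packet_t pkt,
                             uint16_t header) {
  const uint32_t mask = queue->size - 1;  // queue size is a power of two
  uint64_t index = hsa_queue_add_write_index_screlease(queue, 1);
  // Back off while the packet processor is a full queue behind.
  while (index - hsa_queue_load_read_index_relaxed(queue) >= queue->size) {
  }
  auto* slot = &reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
      queue->base_address)[index & mask];
  *slot = pkt;  // copy the body first; the header is still invalid
  // Publish the header last with release semantics so the packet processor
  // never observes a half-written packet.
  __atomic_store_n(reinterpret_cast<uint16_t*>(slot), header, __ATOMIC_RELEASE);
  // Ring the doorbell with the index of the packet just written.
  hsa_signal_store_screlease(queue->doorbell_signal, index);
}
```

As in the hunk above, the doorbell is rung with the packet's own index rather than index - 1, since only one slot is reserved per call.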
@@ -963,8 +963,7 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_ if (capturing == true) { packet->header = header; packet->setup = rest; - amd::Os::fastMemcpy(const_cast(aqlPacket), packet, - sizeof(hsa_kernel_dispatch_packet_t)); + std::memcpy(const_cast(aqlPacket), packet, sizeof(hsa_kernel_dispatch_packet_t)); return true; } else { dispatchBlockingWait(); @@ -981,14 +980,13 @@ bool VirtualGPU::dispatchAqlPacket( inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) { amd::ScopedLock lock(execution()); if (vcmd != nullptr) { - profilingBegin(*vcmd, true, true); + profilingBegin(*vcmd, true); } dispatchBlockingWait(); - constexpr size_t kPacketSize = 1; auto packet = reinterpret_cast(aqlpacket); - dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); + dispatchGenericAqlPacket(packet, packet->header, packet->setup, false); if (vcmd != nullptr) { - profilingEnd(*vcmd, true); + profilingEnd(*vcmd); } return true; } @@ -1003,13 +1001,6 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, // In GFX8 the PM4 IB packet blob is writing directly to AQL queue // In GFX9 the PM4 IB is submitting by AQL Vendor Specific packet and switch (gfxVersion) { - case PerfCounter::ROC_GFX8: - { // Create legacy devices PM4 data - hsa_ext_amd_aql_pm4_packet_t pm4Packet[SLOT_PM4_SIZE_AQLP]; - extApi->hsa_ven_amd_aqlprofile_legacy_get_pm4(packet, static_cast(&pm4Packet[0])); - return dispatchGenericAqlPacket(&pm4Packet[0], 0, 0, blocking, SLOT_PM4_SIZE_AQLP); - } - break; case PerfCounter::ROC_GFX9: case PerfCounter::ROC_GFX10: { @@ -1376,7 +1367,8 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) { kernarg_pool_size_ = kernarg_pool_size; kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal; active_chunk_ = 0; - if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) { + if ((dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) && + roc_device_.info().largeBar_) { kernarg_pool_base_ = reinterpret_cast
(roc_device_.deviceLocalAlloc(kernarg_pool_size_)); } else { @@ -1455,23 +1447,18 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) { * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data * and then calls start() to get the current host timestamp. */ -void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool useCommandTs) { +void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) { if (command.profilingInfo().enabled_) { if (timestamp_ != nullptr) { LogWarning("Trying to create a second timestamp in VirtualGPU. \ This could have unintended consequences."); return; } - Timestamp* ts = useCommandTs ? reinterpret_cast(command.data()) : timestamp_; - if (ts == nullptr) { - // Without barrier profiling will wait for each individual signal - timestamp_ = new Timestamp(this, command); - command.setData(timestamp_); - timestamp_->start(); - } else { - timestamp_ = ts; - } + // Without barrier profiling will wait for each individual signal + timestamp_ = new Timestamp(this, command); + command.data().emplace_back(timestamp_); + timestamp_->start(); // Enable SDMA profiling on the first access if profiling is set // Its not per command basis @@ -1504,11 +1491,10 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling, bool * created for whatever command we are running and calls end() to get the * current host timestamp if no signal is available. */ -void VirtualGPU::profilingEnd(amd::Command& command, bool useCommandTs) { +void VirtualGPU::profilingEnd(amd::Command& command) { if (command.profilingInfo().enabled_) { - Timestamp* ts = useCommandTs ? reinterpret_cast(command.data()) : timestamp_; - if (ts->HwProfiling() == false) { - ts->end(); + if (timestamp_->HwProfiling() == false) { + timestamp_->end(); } timestamp_ = nullptr; } @@ -1541,8 +1527,8 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const { // came before it to start and end with this first valid start time. current = list; while (current != nullptr) { - if (current->data() != nullptr) { - ts = reinterpret_cast(current->data()); + if (!current->data().empty()) { + ts = reinterpret_cast(current->data().back()); ts->getTime(&startTimeStamp, &endTimeStamp); break; } @@ -1564,13 +1550,15 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const { current = list; while (current != nullptr) { if (current->profilingInfo().enabled_) { - if (current->data() != nullptr) { - // Since this is a valid command to get a timestamp, we use the - // timestamp provided by the runtime (saved in the data()) - ts = reinterpret_cast(current->data()); - ts->getTime(&startTimeStamp, &endTimeStamp); - ts->release(); - current->setData(nullptr); + if (!current->data().empty()) { + for (auto i = 0; i < current->data().size(); i++) { + // Since this is a valid command to get a timestamp, we use the + // timestamp provided by the runtime (saved in the data()) + ts = reinterpret_cast(current->data()[i]); + ts->getTime(&startTimeStamp, &endTimeStamp); + ts->release(); + } + current->data().clear(); } else { // If we don't have a command that contains a valid timestamp, // we simply use the end timestamp of the previous command. 
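The profiling changes above turn amd::Command::data() from a single opaque pointer into a container, so one command may carry several Timestamp records and updateCommandsState() walks and releases all of them. The following simplified sketch shows that bookkeeping pattern only; Ts and Cmd are stand-in types, not the runtime's classes:

```cpp
#include <cstdint>
#include <vector>

// Simplified stand-ins for amd::Command and Timestamp, only to show the shape
// of the new bookkeeping where data() is a container rather than a single ptr.
struct Ts { uint64_t start = 0, end = 0; };
struct Cmd {
  std::vector<Ts*> data;  // several timestamp records may be attached
  Cmd* next = nullptr;
};

// Walk a batch and derive profiling times: a command with records uses its own
// values (releasing every record), one without inherits the previous end time.
inline void updateStates(Cmd* list) {
  uint64_t start = 0, end = 0;
  for (Cmd* cur = list; cur != nullptr; cur = cur->next) {
    if (!cur->data.empty()) {
      for (Ts* ts : cur->data) {
        start = ts->start;
        end = ts->end;
        delete ts;  // mirrors releasing each attached timestamp record
      }
      cur->data.clear();
    } else {
      start = end;  // no HW timestamp: reuse the previous command's end time
    }
    // ... record {start, end} on the command's profiling info ...
  }
}
```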
@@ -2010,7 +1998,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { // If these are from different contexts, then one of them could be in the device memory // This is fine, since spec doesn't allow for copies with pointers from different contexts - amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize()); + std::memcpy(cmd.dst(), cmd.src(), cmd.srcSize()); result = true; } else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space Memory* memory = dev().getRocMemory(dstMem); @@ -2173,7 +2161,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) { // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); const void* mappedPtr = hsaMapMemory->owner()->getHostMem(); - amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]); + std::memcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]); } } else { LogError("Unhandled svm map!"); @@ -2204,7 +2192,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) { Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory()); void* mappedPtr = hsaMapMemory->owner()->getHostMem(); - amd::Os::fastMemcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]); + std::memcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]); // Target is a remote resource, so copy if (!blitMgr().copyBuffer(*hsaMapMemory, *memory, writeMapInfo->origin_, writeMapInfo->origin_, writeMapInfo->region_, @@ -2292,7 +2280,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { if ((svmPtr != nullptr) && (hostPtr != svmPtr)) { // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); - amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]); + std::memcpy(svmPtr, hostPtr, size[0]); } } else { result = blitMgr().readBuffer(*hsaMemory, static_cast(hostPtr) + origin[0], origin, @@ -2392,7 +2380,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { if ((svmPtr != nullptr) && (hostPtr != svmPtr)) { // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); - amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]); + std::memcpy(hostPtr, svmPtr, size[0]); } result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_, mapInfo->region_, mapInfo->isEntire()); @@ -2590,8 +2578,8 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { profilingBegin(vcmd); // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr. - amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); - if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); + if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { profilingEnd(vcmd); return; } @@ -2602,26 +2590,39 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { // If Physical address is not set, then it is map command. If set, it is unmap command. 
if (phys_mem_obj != nullptr) { + constexpr bool kParent = false; + amd::Memory* vaddr_sub_obj = phys_mem_obj->getContext().devices()[0]->CreateVirtualBuffer( + phys_mem_obj->getContext(), const_cast(vcmd.ptr()), + vcmd.size(), phys_mem_obj->getUserData().deviceId, kParent); // Map the physical to virtual address the hsa api hsa_amd_vmem_alloc_handle_t opaque_hsa_handle; opaque_hsa_handle.handle = phys_mem_obj->getUserData().hsa_handle; - if ((hsa_status = hsa_amd_vmem_map(vaddr_mem_obj->getSvmPtr(), vcmd.size(), - vaddr_mem_obj->getOffset(), opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) { + if ((hsa_status = hsa_amd_vmem_map(vaddr_sub_obj->getSvmPtr(), vcmd.size(), + vaddr_sub_obj->getOffset(), opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) { assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr); - // Now that we have mapped physical addr to virtual addr, make an entry in the MemObjMap. - amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj); - vaddr_mem_obj->getUserData().phys_mem_obj = phys_mem_obj; + amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_sub_obj); + vaddr_sub_obj->getUserData().phys_mem_obj = phys_mem_obj; + phys_mem_obj->getUserData().vaddr_mem_obj = vaddr_sub_obj; } else { LogError("HSA Command: hsa_amd_vmem_map failed!"); } } else { + dispatchBarrierPacket(kBarrierPacketHeader, false); + Barriers().WaitCurrent(); + + amd::Memory* vaddr_sub_obj = amd::MemObjMap::FindMemObj(vcmd.ptr()); + assert(vaddr_sub_obj != nullptr); + // Unmap the object, since the physical addr is set. - if ((hsa_status = hsa_amd_vmem_unmap(vaddr_mem_obj->getSvmPtr(), vcmd.size())) + if ((hsa_status = hsa_amd_vmem_unmap(vaddr_sub_obj->getSvmPtr(), vcmd.size())) == HSA_STATUS_SUCCESS) { // assert the va is mapped and needs to be removed - assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr); + vaddr_sub_obj->getContext().devices()[0]->DestroyVirtualBuffer(vaddr_sub_obj); amd::MemObjMap::RemoveMemObj(vcmd.ptr()); - vaddr_mem_obj->getUserData().phys_mem_obj = nullptr; + if (vaddr_sub_obj->getUserData().phys_mem_obj != nullptr) { + vaddr_sub_obj->getUserData().phys_mem_obj->getUserData().vaddr_mem_obj = nullptr; + vaddr_sub_obj->getUserData().phys_mem_obj = nullptr; + } } else { LogError("HSA Command: hsa_amd_vmem_unmap failed"); } @@ -2944,7 +2945,7 @@ static inline void nontemporalMemcpy( *reinterpret_cast(src)++); } #else - amd::Os::fastMemcpy(dst, src, size); + std::memcpy(dst, src, size); #endif } @@ -3210,11 +3211,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, } } - const auto pcieKernargs = !dev().isXgmi() && - dev().settings().device_kernel_args_ && - roc_device_.info().largeBar_; address argBuffer = hidden_arguments; bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState(); + size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize()); // Find all parameters for the current kernel if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) { @@ -3222,16 +3221,27 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if (isGraphCapture) { argBuffer = vcmd->getKernArgOffset(); } else { - const auto kernargSize = gpuKernel.KernargSegmentByteSize(); - argBuffer = reinterpret_cast
<address>(allocKernArg(kernargSize, - gpuKernel.KernargSegmentAlignment())); + + argBuffer = reinterpret_cast<address>
( + allocKernArg(gpuKernel.KernargSegmentByteSize(), + gpuKernel.KernargSegmentAlignment())); } - // Load all kernel arguments - nontemporalMemcpy(argBuffer, parameters, - std::min(gpuKernel.KernargSegmentByteSize(), - signature.paramsSize())); - if (pcieKernargs && !isGraphCapture) { - *dev().info().hdpMemFlushCntl = 1u; + + nontemporalMemcpy(argBuffer, parameters, argSize); + + if (roc_device_.info().largeBar_ && !isGraphCapture) { + const auto kernArgImpl = dev().settings().kernel_arg_impl_; + + if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) { + *dev().info().hdpMemFlushCntl = 1u; + volatile auto kSentinel = *dev().info().hdpMemFlushCntl; + } else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && + argSize != 0) { + _mm_sfence(); + *(argBuffer + argSize - 1) = *(parameters + argSize - 1); + _mm_mfence(); + volatile auto kSentinel = *(argBuffer + argSize - 1); + } } } @@ -3267,8 +3277,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if ((devKernel->workGroupInfo()->usedStackSize_ & 0x1) == 0x1) { dispatchPacket.private_segment_size = std::max(dev().StackSize(), dispatchPacket.private_segment_size); - if (dispatchPacket.private_segment_size > 16 * Ki) { - dispatchPacket.private_segment_size = 16 * Ki; + // This is the per-wave scratch limit for every platform except GFX12. + // See MAX_WAVE_SCRATCH constant in ROCM-Runtime:src/core/runtime/amd_gpu_agent.cpp + uint32_t maxWaveScratch = ((1 << 13) - 1) * Ki; + if (dispatchPacket.private_segment_size * dev().info().wavefrontWidth_ > maxWaveScratch) { + LogWarning("Requested kernel launch exceeded maximum per-thread scratch size."); + dispatchPacket.private_segment_size = maxWaveScratch / dev().info().wavefrontWidth_; } } @@ -3293,11 +3307,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; } - if (pcieKernargs && !isGraphCapture) { - if (*dev().info().hdpMemFlushCntl != UINT32_MAX) { - LogError("Unexpected HDP Register readback value!"); - } - } + if (vcmd == nullptr) { // Dispatch the packet if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder, @@ -3323,8 +3333,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, if (gpuKernel.dynamicParallelism()) { dispatchBarrierPacket(kBarrierPacketHeader, true); - static_cast(blitMgr()).runScheduler( - getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_); + if (virtualQueue_ != nullptr) { + static_cast(blitMgr()).runScheduler( + getVQVirtualAddress(), schedulerParam_, schedulerQueue_, + schedulerSignal_, schedulerThreads_); + } } // Check if image buffer write back is required @@ -3461,14 +3474,13 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) { void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true, true); + profilingBegin(vcmd, true); uint8_t* aqlPacket = vcmd.getLastPacket(); if (aqlPacket != nullptr) { dispatchBlockingWait(); - constexpr size_t kPacketSize = 1; auto packet = reinterpret_cast(aqlPacket); - dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize); + dispatchGenericAqlPacket(packet, packet->header, packet->setup, false); // We need to set fence_dirty_ flag as we would use a dispatch packet with a completion signal // 
to track graph finish for the last. The sync logic assumes HW event to a barrier packet that // has a system scope release. This would cause isFenceDirty() check at top level to insert @@ -3483,7 +3495,7 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) { } } - profilingEnd(vcmd, true); + profilingEnd(vcmd); } // ================================================================================================ diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 25a571564..f9f86207d 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -306,8 +306,8 @@ class VirtualGPU : public device::VirtualDevice { bool create(); const Device& dev() const { return roc_device_; } - void profilingBegin(amd::Command& command, bool sdmaProfiling = false, bool useCommandTs = false); - void profilingEnd(amd::Command& command, bool useCommandTs = false); + void profilingBegin(amd::Command& command, bool sdmaProfiling = false); + void profilingEnd(amd::Command& command); void updateCommandsState(amd::Command* list) const; @@ -433,8 +433,7 @@ class VirtualGPU : public device::VirtualDevice { bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true); template bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, - uint16_t rest, bool blocking, - size_t size = 1); + uint16_t rest, bool blocking); void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false, hsa_signal_t signal = hsa_signal_t{0}); @@ -571,5 +570,7 @@ class VirtualGPU : public device::VirtualDevice { bool fence_dirty_; //!< Fence modified flag std::atomic lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask + + using KernelArgImpl = device::Settings::KernelArgImpl; }; } diff --git a/rocclr/os/os.hpp b/rocclr/os/os.hpp index c9bd0b99e..8509cc362 100644 --- a/rocclr/os/os.hpp +++ b/rocclr/os/os.hpp @@ -232,9 +232,6 @@ class Os : AllStatic { //! Deallocate an aligned chunk of memory. static void alignedFree(void* mem); - //! Platform-specific optimized memcpy() - static void* fastMemcpy(void* dest, const void* src, size_t n); - //! NUMA related settings static void setPreferredNumaNode(uint32_t node); diff --git a/rocclr/os/os_posix.cpp b/rocclr/os/os_posix.cpp index 739795e0c..86c199b5b 100644 --- a/rocclr/os/os_posix.cpp +++ b/rocclr/os/os_posix.cpp @@ -524,7 +524,7 @@ int Os::systemCall(const std::string& command) { #if 1 size_t len = command.size(); char* cmd = new char[len + 1]; - fastMemcpy(cmd, command.c_str(), len); + std::memcpy(cmd, command.c_str(), len); cmd[len] = 0; // Split the command into arguments. This is a very @@ -681,8 +681,6 @@ uint64_t Os::xgetbv(uint32_t ecx) { } #endif // ATI_ARCH_X86 -void* Os::fastMemcpy(void* dest, const void* src, size_t n) { return memcpy(dest, src, n); } - uint64_t Os::offsetToEpochNanos() { static uint64_t offset = 0; diff --git a/rocclr/os/os_win32.cpp b/rocclr/os/os_win32.cpp index 3923ec37d..08ba2a21f 100644 --- a/rocclr/os/os_win32.cpp +++ b/rocclr/os/os_win32.cpp @@ -424,7 +424,7 @@ int Os::printf(const char* fmt, ...) 
{ int Os::systemCall(const std::string& command) { #if 1 char* cmd = new char[command.size() + 1]; - fastMemcpy(cmd, command.c_str(), command.size()); + std::memcpy(cmd, command.c_str(), command.size()); cmd[command.size()] = 0; STARTUPINFO si = {0}; @@ -509,255 +509,6 @@ void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); } uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); } -// Various "fast" memcpy implementation (currently win32 only due to compiler limitations) - -// (dgladdin - "recent" below means MMX and later) - -// Very optimized memcpy() routine for all AMD Athlon and Duron family. -// This code uses any of FOUR different basic copy methods, depending -// on the transfer size. -// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or -// "Streaming Store"), and also uses the software prefetchnta instructions, -// be sure youre running on Athlon/Duron or other recent CPU before calling! - -#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". - -#define IN_CACHE_COPY 64 * 1024 // upper limit for movq/movq copy w/SW prefetch -// Next is a copy that uses the MMX registers to copy 8 bytes at a time, -// also using the "unrolled loop" optimization. This code uses -// the software prefetch instruction to get the data into the cache. - -#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch -// For larger blocks, which will spill beyond the cache, its faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. -// USE 64 * 1024 FOR THIS VALUE IF YOURE ALWAYS FILLING A "CLEAN CACHE" - -#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch -#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. - -// Inline assembly syntax for use with Visual C++ - -void* Os::fastMemcpy(void* dest, const void* src, size_t n) { -#if !defined(_WIN64) - - __asm { - - mov ecx, [n] ; number of bytes to copy - mov edi, [dest] ; destination - mov esi, [src] ; source - mov ebx, ecx ; keep a copy of count - - cld - cmp ecx, TINY_BLOCK_COPY - jb $memcpy_ic_3 ; tiny? skip mmx copy - - cmp ecx, 32*1024 ; dont align between 32k-64k because - jbe $memcpy_do_align ; it appears to be slower - cmp ecx, 64*1024 - jbe $memcpy_align_done -$memcpy_do_align: - mov ecx, 8 ; a trick thats faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsbs - -align 4 - movsb - movsb - movsb - movsb - movsb - movsb - movsb - movsb - -$memcpy_align_done: ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count - jz $memcpy_ic_2 ; finish the last few bytes - - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? 
use uncached copy - jae $memcpy_uc_test - - // This is small block copy that uses the MMX registers to copy 8 bytes - // at a time. It uses the "unrolled loop" optimization, and also uses - // the software prefetch instruction to get the data into the cache. -align 16 -$memcpy_ic_1: ; 64-byte block copies, in-cache copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0, [esi+0] ; read 64 bits - movq mm1, [esi+8] - movq [edi+0], mm0 ; write 64 bits - movq [edi+8], mm1 ; note: the normal movq writes the - movq mm2, [esi+16] ; data to cache; a cache line will be - movq mm3, [esi+24] ; allocated as needed, to store the data - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 - - add esi, 64 ; update source pointer - add edi, 64 ; update destination pointer - dec ecx ; count down - jnz $memcpy_ic_1 ; last 64-byte block? - -$memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count -$memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsds - -$memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy - jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here - jz $memcpy_ic_2 ; no more 64-byte blocks left - - // For larger blocks, which will spill beyond the cache, its faster to - // use the Streaming Store instruction MOVNTQ. This write instruction - // bypasses the cache and writes straight to main memory. This code also - // uses the software prefetch instruction to pre-read the data. -align 16 -$memcpy_uc_1: ; 64-byte blocks, uncached copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0,[esi+0] ; read 64 bits - add edi,64 ; update destination pointer - movq mm1,[esi+8] - add esi,64 ; update source pointer - movq mm2,[esi-48] - movntq [edi-64], mm0 ; write 64 bits, bypassing the cache - movq mm0,[esi-40] ; note: movntq also prevents the CPU - movntq [edi-56], mm1 ; from READING the destination address - movq mm1,[esi-32] ; into the cache, only to be over-written - movntq [edi-48], mm2 ; so that also helps performance - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - dec ecx - movntq [edi-8], mm1 - jnz $memcpy_uc_1 ; last 64-byte block? - - jmp $memcpy_ic_2 ; almost done - - // For the largest size blocks, a special technique called Block Prefetch - // can be used to accelerate the read operations. Block Prefetch reads - // one address per cache line, for a series of cache lines, in a short loop. - // This is faster than using software prefetch, in this case. - // The technique is great for getting maximum read bandwidth, - // especially in DDR memory systems. -$memcpy_bp_1: ; large blocks, block prefetch copy - - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? 
- jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that its in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks - - // The smallest copy uses the X86 "movsd" instruction, in an optimized - // form which is an "unrolled loop". Then it handles the last few bytes. -align 4 - movsd - movsd ; perform last 1-15 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd ; perform last 1-7 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - -$memcpy_last_few: ; dword aligned from before movsds - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home - jz $memcpy_final ; no more, lets leave - rep movsb ; the last 1, 2, or 3 bytes - -$memcpy_final: - emms ; clean up the MMX state - sfence ; flush the write buffer - mov eax, [dest] ; ret value = destination pointer - - } -#else // !defined(_WIN64)) - - return memcpy(dest, src, n); - -#endif -} - uint64_t Os::offsetToEpochNanos() { static uint64_t offset = 0; diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index 7e078c9a6..670c4980c 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc. +/* Copyright (c) 2008 - 2024 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,14 +18,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/*! - * \file command.cpp - * \brief Definitions for Event, Command and HostQueue objects. 
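
With the hand-written MMX/`movntq` `Os::fastMemcpy` removed above (it only ever had a real implementation for 32-bit Windows; every other build already forwarded to `memcpy`), the call sites simply use `std::memcpy`. Below is a minimal sketch of the resulting pattern in the two `Os::systemCall` hunks: build a writable, NUL-terminated copy of the command string. The helper names are hypothetical; only the `new[]` / `std::memcpy` / terminator sequence mirrors the diff, and the RAII variant is an illustration, not something the diff proposes.

```cpp
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>

// Pattern used by the updated systemCall hunks (hypothetical helper name).
char* copyCommandRaw(const std::string& command) {
  const std::size_t len = command.size();
  char* cmd = new char[len + 1];
  std::memcpy(cmd, command.c_str(), len);
  cmd[len] = '\0';
  return cmd;  // caller must delete[] cmd, as the existing code does
}

// Equivalent RAII alternative (illustration only): c_str() is NUL-terminated,
// so copying size() + 1 bytes keeps the terminator.
std::vector<char> copyCommandRaii(const std::string& command) {
  return std::vector<char>(command.c_str(), command.c_str() + command.size() + 1);
}

int main() {
  char* raw = copyCommandRaw("ls -l");
  std::vector<char> raii = copyCommandRaii("ls -l");
  delete[] raw;
  return raii.empty() ? 1 : 0;
}
```
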
- * - * \author Laurent Morichetti - * \date October 2008 - */ - #include "platform/activity.hpp" #include "platform/command.hpp" #include "platform/commandqueue.hpp" @@ -317,7 +309,6 @@ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& ev queue_(&queue), next_(nullptr), type_(type), - data_(nullptr), waitingEvent_(waitingEvent), eventWaitList_(eventWaitList), commandWaitBits_(commandWaitBits) { @@ -327,6 +318,18 @@ Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& ev } } +SysmemPool Command::command_pool_; + +// ================================================================================================ +void Command::operator delete(void* ptr) { + command_pool_.Free(ptr); +} + +// ================================================================================================ +void* Command::operator new(size_t size) { + return command_pool_.Alloc(size); +} + // ================================================================================================ void Command::releaseResources() { const Command::EventWaitList& events = eventWaitList(); @@ -371,13 +374,11 @@ void Command::enqueue() { // updated upon the marker completion SetBatchHead(queue_->GetSubmittionBatch()); - setStatus(CL_SUBMITTED); submit(*queue_->vdev()); // The batch will be tracked with the marker now queue_->ResetSubmissionBatch(); } else { - setStatus(CL_SUBMITTED); submit(*queue_->vdev()); } } else { diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index cbd837a2d..06fd4d81c 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc. +/* Copyright (c) 2010 - 2024 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,13 +18,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/*! \file command.hpp - * \brief Declarations for Event, Command and HostQueue objects. - * - * \author Laurent Morichetti - * \date October 2008 - */ - #ifndef COMMAND_HPP_ #define COMMAND_HPP_ @@ -62,6 +55,7 @@ namespace amd { class Command; class HostQueue; +union ComputeCommand; /*! \brief Encapsulates the status of a command. * @@ -254,11 +248,12 @@ union CopyMetadata { */ class Command : public Event { private: + static SysmemPool command_pool_; //!< Pool of active commands HostQueue* queue_; //!< The command queue this command is enqueue into Command* next_; //!< Next GPU command in the queue list Command* batch_head_ = nullptr; //!< The head of the batch commands cl_command_type type_; //!< This command's OpenCL type. - void* data_; + std::vector data_; const Event* waitingEvent_; //!< Waiting event associated with the marker protected: @@ -282,7 +277,6 @@ class Command : public Event { queue_(nullptr), next_(nullptr), type_(type), - data_(nullptr), waitingEvent_(nullptr), eventWaitList_(nullWaitList), commandWaitBits_(0) {} @@ -298,6 +292,10 @@ class Command : public Event { } public: + //! Overload new/delete for fast commands allocation/destruction + void* operator new(size_t size); + void operator delete(void* ptr); + //! Return the queue this command is enqueued into. HostQueue* queue() const { return queue_; } @@ -322,11 +320,9 @@ class Command : public Event { //! Return this command's OpenCL type. cl_command_type type() const { return type_; } - //! 
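
The new `Command::operator new` / `operator delete` overloads route every command allocation through the static `command_pool_`. Because class-scope allocation functions are inherited, each concrete command type picks them up automatically and reports its own `sizeof` to the pool. A minimal, self-contained sketch of that mechanism follows; `Pool`, `BaseCmd`, and `DerivedCmd` are hypothetical stand-ins, and only the operator wiring mirrors what the diff adds to `amd::Command`.

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct Pool {
  void* Alloc(std::size_t size) {
    std::printf("pool alloc of %zu bytes\n", size);
    return std::malloc(size);
  }
  void Free(void* ptr) { std::free(ptr); }
};

class BaseCmd {
 public:
  virtual ~BaseCmd() = default;
  // Class-scope allocation functions are implicitly static and are inherited,
  // so `new DerivedCmd` also lands in the pool and passes sizeof(DerivedCmd).
  void* operator new(std::size_t size) { return pool_.Alloc(size); }
  void operator delete(void* ptr) { pool_.Free(ptr); }
 private:
  static Pool pool_;
};
Pool BaseCmd::pool_;

class DerivedCmd : public BaseCmd {
  char payload_[256];  // sizes differ per command type; the pool sees the real size
};

int main() {
  BaseCmd* cmd = new DerivedCmd();  // Pool::Alloc(sizeof(DerivedCmd))
  delete cmd;                       // virtual dtor, then class-scope operator delete
  return 0;
}
```
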
Return the opaque, device specific data for this command. - void* data() const { return data_; } + //! Return the opaque, device specific data vector for this command. + std::vector& data() { return data_; } - //! Set the opaque, device specific data for this command. - void setData(void* data) { data_ = data; } /*! \brief The execution engine for this command. * @@ -1273,17 +1269,12 @@ class AccumulateCommand : public Command { //! Add kernel name to the list if available void addKernelName(const std::string& kernelName) { - if (activity_prof::IsEnabled(OP_ID_DISPATCH)) { - // "^" is to indicate kernel is captured at instantiate - kernelNames_.push_back("^ " + kernelName); - } + kernelNames_.push_back(kernelName); } //! Add kernel timestamp to the list if available void addTimestamps(uint64_t startTs, uint64_t endTs) { - if (activity_prof::IsEnabled(OP_ID_DISPATCH)) { - tsList_.push_back(std::make_pair(startTs, endTs)); - } + tsList_.push_back(std::make_pair(startTs, endTs)); } //! Return the kernel names @@ -1795,6 +1786,39 @@ class VirtualMapCommand : public Command { const void* ptr() const { return ptr_; } }; +//! Union used in memory suballocator, must be updated with the new commands +union ComputeCommand { + ReadMemoryCommand cmd0; + WriteMemoryCommand cmd1; + FillMemoryCommand cmd2; + CopyMemoryCommand cmd3; + MapMemoryCommand cmd4; + UnmapMemoryCommand cmd5; + MigrateMemObjectsCommand cmd6; + NDRangeKernelCommand cmd7; + NativeFnCommand cmd8; + ExternalSemaphoreCmd cmd9; + Marker cmd10; + AccumulateCommand cmd11; + AcquireExtObjectsCommand cmd13; + ReleaseExtObjectsCommand cmd14; + PerfCounterCommand cmd15; + ThreadTraceMemObjectsCommand cmd16; + ThreadTraceCommand cmd17; + SignalCommand cmd18; + MakeBuffersResidentCommand cmd19; + SvmFreeMemoryCommand cmd20; + SvmCopyMemoryCommand cmd21; + SvmFillMemoryCommand cmd22; + SvmMapMemoryCommand cmd23; + SvmUnmapMemoryCommand cmd24; + CopyMemoryP2PCommand cmd25; + SvmPrefetchAsyncCommand cmd26; + VirtualMapCommand cmd27; + ComputeCommand() {} + ~ComputeCommand() {} +}; + /*! @} * @} */ diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 9375b4324..6dde23d92 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -59,11 +59,11 @@ HostQueue::HostQueue(Context& context, Device& device, cl_command_queue_properti bool HostQueue::terminate() { if (AMD_DIRECT_DISPATCH) { if (vdev() != nullptr) { - Command* marker = new Marker(*this, true); - if (marker != nullptr) { - marker->enqueue(); - marker->awaitCompletion(); - marker->release(); + // If the queue still has the last command, then wait and release it + if (lastEnqueueCommand_ != nullptr) { + lastEnqueueCommand_->awaitCompletion(); + lastEnqueueCommand_->release(); + lastEnqueueCommand_ = nullptr; } } thread_.Release(); diff --git a/rocclr/platform/interop_gl.cpp b/rocclr/platform/interop_gl.cpp index 342b034e6..20cf3a7d0 100644 --- a/rocclr/platform/interop_gl.cpp +++ b/rocclr/platform/interop_gl.cpp @@ -51,7 +51,8 @@ bool amd::ClGlEvent::waitForFence() { GLenum ret; // get fence id associated with fence event - GLsync gs = reinterpret_cast(command().data()); + GLsync gs = !command().data().empty() ? 
reinterpret_cast(command().data().back()) + : nullptr; if (!gs) return false; // Try to use DC and GLRC of current thread, if it doesn't exist diff --git a/rocclr/platform/memory.cpp b/rocclr/platform/memory.cpp index 91933e404..1242ccd15 100644 --- a/rocclr/platform/memory.cpp +++ b/rocclr/platform/memory.cpp @@ -320,10 +320,7 @@ bool Memory::create(void* initFrom, bool sysMemAlloc, bool skipAlloc, bool force } } } - // Add a VA range into VA range map - if (getMemFlags() & CL_MEM_VA_RANGE_AMD) { - amd::MemObjMap::AddVirtualMemObj(getSvmPtr(), this); - } + // Store the unique id for each memory allocation uniqueId_ = ++numAllocs; return true; @@ -543,6 +540,11 @@ void Memory::uncommitSvmMemory() { } } +Device* Memory::GetDeviceById() { + size_t device_idx = (userData_.deviceId < getContext().devices().size()) ? userData_.deviceId : 0; + return getContext().devices()[device_idx]; +} + void Buffer::initDeviceMemory() { deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Buffer)); memset(deviceMemories_, 0, NumDevicesWithP2P() * sizeof(DeviceMemory)); @@ -1524,7 +1526,6 @@ bool SvmBuffer::Contains(uintptr_t ptr) { // The allocation flags are ignored for now. void* SvmBuffer::malloc(Context& context, cl_svm_mem_flags flags, size_t size, size_t alignment, const amd::Device* curDev) { - bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; void* ret = context.svmAlloc(size, alignment, flags, curDev); if (ret == nullptr) { LogError("Unable to allocate aligned memory"); diff --git a/rocclr/platform/memory.hpp b/rocclr/platform/memory.hpp index afda26623..daa06308e 100644 --- a/rocclr/platform/memory.hpp +++ b/rocclr/platform/memory.hpp @@ -44,6 +44,7 @@ #define ROCCLR_MEM_HSA_UNCACHED (1u << 27) #define ROCCLR_MEM_INTERPROCESS (1u << 26) #define ROCCLR_MEM_PHYMEM (1u << 25) +#define ROCCLR_MEM_HSA_CONTIGUOUS (1u << 24) namespace device { class Memory; @@ -143,7 +144,6 @@ class Memory : public amd::RuntimeObject { enum MemoryType { kSvmMemoryPtr = 0x1, kArenaMemoryPtr = 0x100, - kPhyMemHandlePtr = 0x101 }; struct UserData @@ -151,6 +151,7 @@ class Memory : public amd::RuntimeObject { int deviceId = 0; //!< Device ID memory is allocated on void* data = nullptr; //!< Opaque user data from CL or HIP or etc. 
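
Stepping back to the `Command::data()` change shown a few hunks above: the single opaque `void*` (with `setData()`) becomes a `std::vector<void*>`, so consumers such as `ClGlEvent::waitForFence()` must now guard `back()` with an `empty()` check. A toy illustration of that producer/consumer contract; `ToyCommand`, `attachHandle`, and `latestHandle` are hypothetical names, not repo code.

```cpp
#include <vector>

// Commands carry a vector of opaque handles instead of one void*; producers
// append with push_back(), consumers check empty() before back(), as the
// updated waitForFence() does for the GLsync handle.
struct ToyCommand {
  std::vector<void*>& data() { return data_; }
 private:
  std::vector<void*> data_;
};

void attachHandle(ToyCommand& cmd, void* handle) {
  cmd.data().push_back(handle);  // replaces the removed setData(void*) call
}

void* latestHandle(ToyCommand& cmd) {
  // back() on an empty vector is undefined behaviour, hence the guard.
  return !cmd.data().empty() ? cmd.data().back() : nullptr;
}

int main() {
  ToyCommand cmd;
  int fence = 0;  // stands in for a device-specific handle
  attachHandle(cmd, &fence);
  return latestHandle(cmd) == &fence ? 0 : 1;
}
```
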
amd::Memory* phys_mem_obj = nullptr; // + #include "top.hpp" #include "os/alloc.hpp" #include "thread/monitor.hpp" @@ -190,6 +192,80 @@ struct Coord3D { } }; +template +class SysmemPool { +public: + SysmemPool(): chunk_access_("Sysmem Pool Lock", true) {} + ~SysmemPool() { + // Release current chunk + if (chunks_.size() == 1) { + auto it = chunks_.begin(); + auto idx = kAllocChunkSize - (current_alloc_.load() % kAllocChunkSize); + // Make sure all allocations were released + if (idx == (*it)->free_) { + delete [] (*it)->allocs_; + delete (*it); + chunks_.erase(it); + } + } + } + void* Alloc(size_t size) { + guarantee(size <= sizeof(T), "Bigger size than pool allows!"); + size_t current = current_alloc_++; + auto idx = current / kAllocChunkSize; + while (idx >= max_chunk_idx_) { + ScopedLock lock(chunk_access_); + // Second check in a case of multiple waiters + if (idx == max_chunk_idx_) { + auto allocs = new T[kAllocChunkSize]; + chunks_.emplace(new AllocChunk(allocs)); + active_allocs_[idx % kActiveAllocSize] = allocs; + max_chunk_idx_++; + } + } + return &active_allocs_[idx % kActiveAllocSize][current % kAllocChunkSize]; + } + + void Free(void* ptr) { + ScopedLock lock(chunk_access_); + bool found = false; + // Search for the pointer in all valid chunks + for (auto it : chunks_) { + if (reinterpret_cast(ptr) >= reinterpret_cast(it->allocs_) && + reinterpret_cast(ptr) < + (reinterpret_cast(it->allocs_) + sizeof(T) * kAllocChunkSize)) { + it->free_--; + found = true; + // Destory current chunk if all allocations are freed + if (it->free_ == 0) { + delete [] it->allocs_; + delete it; + chunks_.erase(it); + } + break; + } + } + if (!found) { + guarantee(true, "Mempool releases incorrect memory!\n"); + } + } + +private: + static constexpr size_t kAllocChunkSize = 1024; //!< The total number of allocations in a chunk + static constexpr size_t kActiveAllocSize = 32; //!< The number of active chunks + struct AllocChunk { + T* allocs_; //! Array of allocations + uint32_t free_; //! 
The number of commands still available for usage + AllocChunk(T* alloc): allocs_(alloc), free_(kAllocChunkSize) {} + }; + + std::atomic current_alloc_ = 0; //!< Current allocation, global index + size_t max_chunk_idx_ = 0; //!< Current max chunk index + amd::Monitor chunk_access_; //!< Lock for the chunk list access + std::set chunks_; //!< List of allocated memory chunks + T* active_allocs_[kActiveAllocSize] = {}; //!< Active chunks for fast access +}; + } // namespace amd template typename amd::as_internal::type* as_amd(CL* cl_obj) { diff --git a/rocclr/platform/runtime.cpp b/rocclr/platform/runtime.cpp index 6072a62a9..571668cc7 100644 --- a/rocclr/platform/runtime.cpp +++ b/rocclr/platform/runtime.cpp @@ -100,11 +100,24 @@ void Runtime::tearDown() { initialized_ = false; } -class RuntimeTearDown : public amd::HeapObject { -public: - RuntimeTearDown() {} - ~RuntimeTearDown() { /*Runtime::tearDown();*/ } -} runtime_tear_down; +std::vector RuntimeTearDown::external_; + +RuntimeTearDown::~RuntimeTearDown() { +#ifndef _WIN32 + if (amd::IS_HIP) { + for (auto it: external_) { + it->release(); + } + Runtime::tearDown(); + } +#endif +} + +void RuntimeTearDown::RegisterObject(ReferenceCountedObject* obj) { + external_.push_back(obj); +} + +class RuntimeTearDown runtime_tear_down; uint ReferenceCountedObject::retain() { return referenceCount_.fetch_add(1, std::memory_order_relaxed) + 1; @@ -120,12 +133,4 @@ uint ReferenceCountedObject::release() { return newCount; } -#ifndef _WIN32 -void __attribute__((destructor)) hipTearDown() { - if (amd::IS_HIP) { - Runtime::tearDown(); - } -} -#endif - } // namespace amd diff --git a/rocclr/platform/runtime.hpp b/rocclr/platform/runtime.hpp index f582e7032..d1953aeda 100644 --- a/rocclr/platform/runtime.hpp +++ b/rocclr/platform/runtime.hpp @@ -56,41 +56,20 @@ class Runtime : AllStatic { } }; -#if 0 -class HostThread : public Thread -{ -private: - virtual void run(void* data) { ShouldNotCallThis(); } - -public: - HostThread() : Thread("HostThread", 0, false) - { - setHandle(NULL); - setCurrent(); - - if (!amd::Runtime::initialized() && !amd::Runtime::init()) { - return; - } - - Os::currentStackInfo(&stackBase_, &stackSize_); - setState(RUNNABLE); - } - - bool isHostThread() const { return true; }; - - static inline HostThread* current() - { - Thread* thread = Thread::current(); - assert(thread->isHostThread() && "just checking"); - return (HostThread*) thread; - } -}; -#endif - /*@}*/ inline bool Runtime::initialized() { return initialized_; } +class RuntimeTearDown : public HeapObject { + static std::vector external_; + +public: + RuntimeTearDown() {} + ~RuntimeTearDown(); + + static void RegisterObject(ReferenceCountedObject* obj); +}; + } // namespace amd #endif /*RUNTIME_HPP_*/ diff --git a/rocclr/utils/debug.hpp b/rocclr/utils/debug.hpp index c8c362568..1fa864c1e 100644 --- a/rocclr/utils/debug.hpp +++ b/rocclr/utils/debug.hpp @@ -57,6 +57,7 @@ enum LogMask { LOG_LOCATION = 65536, //!< (0x10000) Log message location LOG_MEM = 131072, //!< (0x20000) Memory allocation LOG_MEM_POOL = 262144, //!< (0x40000) Memory pool allocation, including memory in graphs + LOG_TS = 524288, //!< (0x80000) Timestamp details LOG_ALWAYS = -1 //!< (0xFFFFFFFF) Log always even mask flag is zero }; diff --git a/rocclr/utils/flags.cpp b/rocclr/utils/flags.cpp index f19d26a6c..9ac9421b8 100644 --- a/rocclr/utils/flags.cpp +++ b/rocclr/utils/flags.cpp @@ -84,12 +84,6 @@ namespace amd { bool IS_HIP = false; -#if !defined(_WIN32) && defined(WITH_PAL_DEVICE) -bool IS_LEGACY = true; 
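
The `SysmemPool` added in object.hpp hands out fixed-size slots from 1024-element chunks: an atomic counter assigns a global slot index, `Alloc()` maps it to the owning chunk, and `Free()` decrements that chunk's counter, releasing the whole chunk only after every slot has been returned. Below is a greatly simplified, self-contained sketch of that scheme; `TinyPool` is a hypothetical stand-in, not the repo template, and it omits the `active_allocs_` fast path and the `Monitor`-based locking.

```cpp
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <mutex>

template <typename T, std::size_t kChunk = 1024>
class TinyPool {
 public:
  void* Alloc() {
    std::size_t slot = next_.fetch_add(1);   // global slot index
    std::size_t chunk = slot / kChunk;       // which chunk owns the slot
    std::lock_guard<std::mutex> lock(mtx_);
    Chunk& c = chunks_[chunk];
    if (c.storage == nullptr) {              // first slot of a new chunk
      c.storage = new T[kChunk];
      c.live = kChunk;
    }
    return &c.storage[slot % kChunk];
  }

  void Free(void* ptr) {
    std::lock_guard<std::mutex> lock(mtx_);
    auto p = reinterpret_cast<std::uintptr_t>(ptr);
    for (auto it = chunks_.begin(); it != chunks_.end(); ++it) {
      auto base = reinterpret_cast<std::uintptr_t>(it->second.storage);
      if (p >= base && p < base + sizeof(T) * kChunk) {
        if (--it->second.live == 0) {        // every slot returned
          delete[] it->second.storage;       // drop the whole chunk
          chunks_.erase(it);
        }
        return;
      }
    }
    assert(false && "pointer does not belong to this pool");
  }

 private:
  struct Chunk {
    T* storage = nullptr;
    std::uint32_t live = 0;
  };
  std::atomic<std::size_t> next_{0};
  std::mutex mtx_;
  std::map<std::size_t, Chunk> chunks_;      // chunk index -> storage
};

struct Slot { unsigned char bytes[256]; };   // stand-in for the ComputeCommand union

int main() {
  TinyPool<Slot> pool;
  void* a = pool.Alloc();
  void* b = pool.Alloc();
  pool.Free(a);
  pool.Free(b);  // the chunk itself is reclaimed only once all 1024 slots return
  return 0;
}
```

In the real pool the slot type is sized for the largest command, which is what the new `ComputeCommand` union in command.hpp provides, so every `Command` subclass fits in one slot.
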
-#else -bool IS_LEGACY = false; -#endif - // static char* Flag::envstr_; diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index 72f41dc69..5d894e498 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -194,11 +194,11 @@ release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \ "Enable system scope for signals (uses interrupts).") \ release(bool, GPU_FORCE_QUEUE_PROFILING, false, \ "Force command queue profiling by default") \ -release(bool, HIP_MEM_POOL_SUPPORT, IS_WINDOWS, \ +release(bool, HIP_MEM_POOL_SUPPORT, true, \ "Enables memory pool support in HIP") \ -release(bool, HIP_MEM_POOL_USE_VM, IS_WINDOWS, \ +release(bool, HIP_MEM_POOL_USE_VM, true, \ "Enables memory pool support in HIP") \ -release(bool, PAL_HIP_IPC_FLAG, true, \ +release(bool, PAL_HIP_IPC_FLAG, true, \ "Enable interprocess flag for device allocation in PAL HIP") \ release(uint, PAL_FORCE_ASIC_REVISION, 0, \ "Force a specific asic revision for all devices") \ @@ -219,18 +219,16 @@ release(uint, ROC_P2P_SDMA_SIZE, 1024, \ "The minimum size in KB for P2P transfer with SDMA") \ release(uint, ROC_AQL_QUEUE_SIZE, 16384, \ "AQL queue size in AQL packets") \ -release(uint, ROC_SIGNAL_POOL_SIZE, 4096, \ +release(uint, ROC_SIGNAL_POOL_SIZE, 64, \ "Initial size of HSA signal pool") \ release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \ "Limit the number of workgroups in blit operations") \ +release(bool, DEBUG_CLR_BLIT_KERNARG_OPT, false, \ + "Enable blit kernel arguments optimization") \ release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \ "If true, then runtime can skip kernel arg copy") \ release(bool, GPU_STREAMOPS_CP_WAIT, false, \ "Force the stream wait memory operation to wait on CP.") \ -release(bool, HIP_USE_RUNTIME_UNBUNDLER, false, \ - "Force this to use Runtime code object unbundler.") \ -release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \ - "Set this to true to force runtime unbundler in hiprtc.") \ release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \ "Set initial heap size for device malloc.") \ release(bool, HIP_FORCE_DEV_KERNARG, true, \ diff --git a/rocclr/utils/util.hpp b/rocclr/utils/util.hpp index 970c0a3ce..7bc0a5248 100644 --- a/rocclr/utils/util.hpp +++ b/rocclr/utils/util.hpp @@ -238,10 +238,9 @@ template class ScopeGuard { #define MAKE_SCOPE_GUARD(name, ...) \ MAKE_SCOPE_GUARD_HELPER(XCONCAT(scopeGuardLambda, __COUNTER__), name, __VA_ARGS__) - // utility function to convert half precision to float to a // single precision value. -inline void half2float(uint16_t Val, uint32_t *Res) { +inline float half2float(const uint16_t Val) { constexpr uint32_t halfExpoentMask = 0x7c00; constexpr uint32_t halfFractionMask = 0x03ff; constexpr uint32_t floatExponentBias = 127; @@ -252,17 +251,21 @@ inline void half2float(uint16_t Val, uint32_t *Res) { uint32_t exponent = (Val & halfExpoentMask) >> 10; uint32_t fraction = ((uint32_t)(Val & halfFractionMask)) << 13; // Aligning half fraction to float + union { + uint32_t u32Arg; + float fArg; + }; // Handling special cases if (exponent == 0x1f) { // NaN or Infinity // When all exponent bits are 1, the value is either Infinity or NaN // For NaN, the fraction part should also be non-zero. 
- *Res = signBit | 0x7f800000 | + u32Arg = signBit | 0x7f800000 | fraction; // setting exponent to all 1's and keeping the fraction - return; + return fArg; } else if (exponent == 0) { // Subnormal numbers or zero if (fraction == 0) { - *Res = signBit; // Plus or minus zero - return; + u32Arg = signBit; // Plus or minus zero + return fArg; } else { // Normalize subnormal number while ((fraction & (1 << 23)) == 0) { @@ -277,7 +280,8 @@ inline void half2float(uint16_t Val, uint32_t *Res) { uint32_t floatExponent = ((exponent + floatExponentBias - halfExponentBias) & 0xff) << floatExponentShift; - *Res = signBit | floatExponent | fraction; + u32Arg = signBit | floatExponent | fraction; + return fArg; } /*@}*/} // namespace amd
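
The rewritten `amd::half2float` in rocclr/utils/util.hpp now returns the value directly and reinterprets the assembled bit pattern through an anonymous union instead of writing through an out-parameter. The spot checks below follow from the IEEE-754 binary16 encoding (1 sign bit, 5 exponent bits with bias 15, 10 fraction bits); this sketch assumes the rocclr include directory is on the include path and that the conversion is mathematically exact, which a half-to-float widening always is.

```cpp
#include <cassert>
#include <cmath>

#include "utils/util.hpp"  // amd::half2float, path as shown in the diff

int main() {
  assert(amd::half2float(0x0000) == 0.0f);                // +0
  assert(amd::half2float(0x3C00) == 1.0f);                // exponent 15 (the bias) -> 2^0 * 1.0
  assert(amd::half2float(0xC000) == -2.0f);               // sign set, exponent 16 -> -(2^1)
  assert(amd::half2float(0x7BFF) == 65504.0f);            // largest finite half
  assert(amd::half2float(0x0001) == 1.0f / 16777216.0f);  // smallest subnormal, 2^-24
  assert(std::isinf(amd::half2float(0x7C00)));            // all exponent bits set -> +inf
  return 0;
}
```

The union-based reinterpretation relies on the de facto type-punning guarantee that mainstream compilers provide; on C++20 toolchains, `std::bit_cast<float>(u32Arg)` would express the same conversion without leaning on that assumption.
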