diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h
index c7791ad29..61aefcdec 100644
--- a/src/core/inc/amd_gpu_agent.h
+++ b/src/core/inc/amd_gpu_agent.h
@@ -67,6 +67,7 @@ struct ScratchInfo {
   void* queue_base;
   size_t size;
   size_t size_per_thread;
+  uint32_t lanes_per_wave;
   ptrdiff_t queue_process_offset;
   bool large;
   bool retry;
diff --git a/src/core/inc/amd_gpu_pm4.h b/src/core/inc/amd_gpu_pm4.h
index b188682ac..008e5efa5 100644
--- a/src/core/inc/amd_gpu_pm4.h
+++ b/src/core/inc/amd_gpu_pm4.h
@@ -72,6 +72,12 @@
 #  define PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA  (1 << 29)
 #define PM4_ACQUIRE_MEM_DW2_COHER_SIZE(x)                  (((x) & 0xFFFFFFFF) << 0)
 #define PM4_ACQUIRE_MEM_DW3_COHER_SIZE_HI(x)               (((x) & 0xFF) << 0)
+#define PM4_ACQUIRE_MEM_DW7_GCR_CNTL(x)                    (((x) & 0x7FFFF) << 0)  // 19-bit GCR_CNTL value; GpuAgent::InvalidateCodeCaches stores it in dword 7 of its ACQUIRE_MEM packet when the ISA major version is not below 10
+#  define PM4_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x)              (((x) & 0x3) << 0)  // 2-bit field in bits [1:0]; presumably the instruction-cache invalidate mode -- TODO confirm against the PM4 packet spec
+#  define PM4_ACQUIRE_MEM_GCR_CNTL_GLK_INV                 (1 << 7)  // bit 7; presumably scalar (constant) cache invalidate -- TODO confirm
+#  define PM4_ACQUIRE_MEM_GCR_CNTL_GLV_INV                 (1 << 8)  // bit 8; presumably vector L0 cache invalidate -- TODO confirm
+#  define PM4_ACQUIRE_MEM_GCR_CNTL_GL1_INV                 (1 << 9)  // bit 9; presumably GL1 cache invalidate -- TODO confirm
+#  define PM4_ACQUIRE_MEM_GCR_CNTL_GL2_INV                 (1 << 14)  // bit 14; presumably GL2 cache invalidate -- TODO confirm
 
 #define PM4_RELEASE_MEM_DW1_EVENT_INDEX(x)                 (((x) & 0xF) << 8)
 #  define PM4_RELEASE_MEM_EVENT_INDEX_AQL                  0x7
diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h
index 59ba7dc4a..089390ed6 100644
--- a/src/core/inc/amd_gpu_shaders.h
+++ b/src/core/inc/amd_gpu_shaders.h
@@ -127,6 +127,13 @@ static const unsigned int kCodeTrapHandler9[] = {
     .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT   , 26
     .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT     , 15
     .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000
+  .elseif .amdgcn.gfx_generation_number == 10
+    .set TTMP11_SAVE_REPLAY_W64H_SHIFT         , 31
+    .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT   , 24
+    .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT      , 25
+    .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT     , 15
+    .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000
+    .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK       , 0x2000000
   .else
     .error "unsupported target"
   .endif
@@ -217,6 +224,14 @@ static const unsigned int kCodeTrapHandler9[] = {
     s_and_b32            ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
     s_setreg_b32         hwreg(HW_REG_IB_STS), ttmp2
   .endif
+  .if .amdgcn.gfx_generation_number == 10
+    s_lshr_b32           ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+    s_and_b32            ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+    s_lshr_b32           ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
+    s_and_b32            ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
+    s_or_b32             ttmp2, ttmp2, ttmp3
+    s_setreg_b32         hwreg(HW_REG_IB_STS), ttmp2
+  .endif
 
     // Restore SQ_WAVE_STATUS.
     s_and_b64            exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
@@ -296,6 +311,77 @@ static const unsigned int kCodeFill8[] = {
     0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
 };
 
+static const unsigned int kCodeCopyAligned10[] = {  // Precompiled shader dwords; listed as the compute_10 entry of "CopyAligned" in GpuAgent::AssembleShader. NOTE(review): the source these dwords were generated from is not part of this patch.
+    0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
+    0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050,
+    0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02,
+    0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006,
+    0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E,
+    0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03,
+    0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05,
+    0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209,
+    0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04,
+    0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E,
+    0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
+    0xBF8C3F70, 0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05,
+    0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02,
+    0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E,
+    0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E,
+    0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
+    0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05,
+    0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010,
+    0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05,
+    0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000,
+    0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000,
+};  // sizeof(kCodeCopyAligned10) is passed alongside the pointer in the AssembleShader table, so the dword count must not change without regenerating the whole array.
+
+static const unsigned int kCodeCopyMisaligned10[] = {  // Precompiled shader dwords; listed as the compute_10 entry of "CopyMisaligned" in GpuAgent::AssembleShader. NOTE(review): the source these dwords were generated from is not part of this patch.
+    0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
+    0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002,
+    0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207,
+    0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102,
+    0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
+    0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
+    0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
+    0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
+    0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810,
+    0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810,
+    0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810,
+    0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810,
+    0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008,
+    0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05,
+    0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000,
+    0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70,
+    0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020810, 0xD5286A05, 0x01A90105,
+    0xBF82FFEE, 0xBF810000,
+};  // sizeof(kCodeCopyMisaligned10) is passed alongside the pointer in the AssembleShader table, so the dword count must not change without regenerating the whole array.
+
+static const unsigned int kCodeFill10[] = {  // Precompiled shader dwords; listed as the compute_10 entry of "Fill" in GpuAgent::AssembleShader. NOTE(review): the source these dwords were generated from is not part of this patch.
+    0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602,
+    0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
+    0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03,
+    0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402,
+    0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B,
+    0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103,
+    0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402,
+    0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
+};  // sizeof(kCodeFill10) is passed alongside the pointer in the AssembleShader table, so the dword count must not change without regenerating the whole array.
+
+static const unsigned int kCodeTrapHandler10[] = {  // Precompiled trap handler dwords; listed as the compute_10 entry of "TrapHandler" in GpuAgent::AssembleShader. Presumably assembled from the trap handler source in this header with .amdgcn.gfx_generation_number == 10 -- TODO confirm.
+    0xB96EF803, 0x8770FF6E, 0x10000100, 0xBF06FF70, 0x00000100, 0xBEF003FF,
+    0x20000000, 0xBF85000E, 0x8770FF6E, 0x00000800, 0xBEF003F4, 0xBF85000A,
+    0x93EEFF6D, 0x00080010, 0xBF84002C, 0xBF06826E, 0xBEF003FF, 0x80000000,
+    0xBF850003, 0x806C846C, 0x826D806D, 0xBF820025, 0xBEFE03FF, 0x80000000,
+    0xBF90000A, 0xBF800007, 0xBF0C9F7E, 0xBF84FFFD, 0x876EFF7E, 0x000003FF,
+    0x8F6E836E, 0xF4051BBD, 0xDC000000, 0xBF8CC07F, 0xF4051BB7, 0xFA0000C0,
+    0xBF8CC07F, 0xBEF10380, 0xF6811C37, 0xFA000008, 0xBF8CC07F, 0x88707170,
+    0xBF85000E, 0xF4051C37, 0xFA000010, 0xBF8CC07F, 0x87F07070, 0xBF840009,
+    0xF4011BB7, 0xFA000018, 0xBF8CC07F, 0xF4411BB8, 0xFA000000, 0xBF8CC07F,
+    0xBEFC0380, 0xBF800000, 0xBF900001, 0x8878FF78, 0x00002000, 0x906E8977,
+    0x876FFF6E, 0x003F8000, 0x906E8677, 0x876EFF6E, 0x02000000, 0x886E6F6E,  // the literals 0x003F8000 and 0x02000000 equal the gfx10 values of SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK and SQ_WAVE_IB_STS_REPLAY_W64H_MASK set in the trap handler source
+    0xB9EEF807, 0x87FE7E7E, 0x87EA6A6A, 0xB9F8F802, 0xBE80226C,
+};  // sizeof(kCodeTrapHandler10) is passed alongside the pointer in the AssembleShader table, so the dword count must not change without regenerating the whole array.
+
 }  // namespace amd
 
 #endif  // header guard
diff --git a/src/core/inc/registers.h b/src/core/inc/registers.h
index 39d86aecf..211ff5f9d 100644
--- a/src/core/inc/registers.h
+++ b/src/core/inc/registers.h
@@ -85,6 +85,10 @@ BUF_NUM_FORMAT_RESERVED_6__VI            = 0x00000006,
 BUF_NUM_FORMAT_FLOAT                     = 0x00000007,
 } BUF_NUM_FORMAT;
 
+typedef enum BUF_FORMAT {  // Values for the 7-bit FORMAT field of SQ_BUF_RSRC_WORD3_GFX10; only the one value used by this runtime is listed.
+BUF_FORMAT_32_UINT                       = 0x00000014,  // written to srd3.bits.FORMAT for the scratch buffer descriptor in AqlQueue::InitScratchSRD on the gfx10 path
+} BUF_FORMAT;
+
 typedef enum SQ_SEL_XYZW01 {
 SQ_SEL_0                                 = 0x00000000,
 SQ_SEL_1                                 = 0x00000001,
@@ -201,4 +205,38 @@ SQ_SEL_W                                 = 0x00000007,
 	float	f32All;
 	};
 
+	union SQ_BUF_RSRC_WORD3_GFX10 {  // Fourth dword of a buffer resource descriptor in the gfx10 layout; AqlQueue::InitScratchSRD fills it and stores u32All into scratch_resource_descriptor[3] when the ISA major version is not below 10.
+	struct {  // Field widths sum to 32 bits. Bit ranges noted below assume LSB-first bitfield allocation on little-endian targets -- TODO confirm for the compilers in use.
+#if		defined(LITTLEENDIAN_CPU)
+                unsigned int                       DST_SEL_X : 3;  // bits [2:0]
+                unsigned int                       DST_SEL_Y : 3;  // bits [5:3]
+                unsigned int                       DST_SEL_Z : 3;  // bits [8:6]
+                unsigned int                       DST_SEL_W : 3;  // bits [11:9]
+                unsigned int                          FORMAT : 7;  // bits [18:12]; takes a BUF_FORMAT value
+                unsigned int                       RESERVED1 : 2;  // bits [20:19]; InitScratchSRD writes 0
+                unsigned int                    INDEX_STRIDE : 2;  // bits [22:21]; InitScratchSRD writes 0 and notes the value is filled in by CP
+                unsigned int                  ADD_TID_ENABLE : 1;  // bit 23
+                unsigned int                  RESOURCE_LEVEL : 1;  // bit 24
+                unsigned int                       RESERVED2 : 3;  // bits [27:25]; InitScratchSRD writes 0
+                unsigned int                      OOB_SELECT : 2;  // bits [29:28]; InitScratchSRD writes 2, described there as no bounds check in swizzle mode
+                unsigned int                            TYPE : 2;  // bits [31:30]; InitScratchSRD writes SQ_RSRC_BUF
+#elif		defined(BIGENDIAN_CPU)
+                unsigned int                            TYPE : 2;  // same fields as above, declared in reverse order
+                unsigned int                      OOB_SELECT : 2;
+                unsigned int                       RESERVED2 : 3;
+                unsigned int                  RESOURCE_LEVEL : 1;
+                unsigned int                  ADD_TID_ENABLE : 1;
+                unsigned int                    INDEX_STRIDE : 2;
+                unsigned int                       RESERVED1 : 2;
+                unsigned int                          FORMAT : 7;
+                unsigned int                       DST_SEL_W : 3;
+                unsigned int                       DST_SEL_Z : 3;
+                unsigned int                       DST_SEL_Y : 3;
+                unsigned int                       DST_SEL_X : 3;
+#endif  // there is no #else branch: if neither endianness macro is defined the struct has no members
+	} bitfields, bits;  // two names for the same field view
+	unsigned int	u32All;  // whole dword as an unsigned value; this is what gets copied into the queue descriptor
+	signed int	i32All;  // whole dword reinterpreted as signed
+	float	f32All;  // whole dword reinterpreted as float
+	};
 #endif  // header guard
diff --git a/src/core/inc/sdma_registers.h b/src/core/inc/sdma_registers.h
index 6e1a7fbf5..cf91cf002 100644
--- a/src/core/inc/sdma_registers.h
+++ b/src/core/inc/sdma_registers.h
@@ -295,7 +295,14 @@ typedef struct SDMA_PKT_FENCE_TAG {
     struct {
       unsigned int op : 8;
       unsigned int sub_op : 8;
-      unsigned int reserved_0 : 16;
+      unsigned int mtype : 3;
+      unsigned int gcc : 1;
+      unsigned int sys : 1;
+      unsigned int pad1 : 1;
+      unsigned int snp : 1;
+      unsigned int gpa : 1;
+      unsigned int l2_policy : 2;
+      unsigned int reserved_0 : 6;
     };
     unsigned int DW_0_DATA;
   } HEADER_UNION;
diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp
index 3a8e68641..f2bdf85d3 100644
--- a/src/core/runtime/amd_aql_queue.cpp
+++ b/src/core/runtime/amd_aql_queue.cpp
@@ -749,7 +749,7 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
     }
 
     // Process only one queue error.
-    if (error_code == 1) {
+    if (error_code & 0x401) {  // insufficient scratch, wave64 or wave32
       // Insufficient scratch - recoverable, don't process dynamic scratch if errors are present.
       auto& scratch = queue->queue_scratch_;
 
@@ -764,10 +764,11 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       uint32_t scratch_request = pkt.dispatch.private_segment_size;
 
       scratch.size_per_thread = scratch_request;
+      scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
       // Align whole waves to 1KB.
-      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
+      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
       scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
-          queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;
+          queue->agent_->properties().MaxSlotsScratchCU * scratch.lanes_per_wave;
 
       queue->agent_->AcquireQueueScratch(scratch);
 
@@ -948,7 +949,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b) {
     rel_mem[4] = 0;
     rel_mem[5] = 0;
     rel_mem[6] = 0;
-  } else if (agent_->isa()->GetMajorVersion() == 9) {
+  } else if (agent_->isa()->GetMajorVersion() >= 9) {
     // Construct an AQL packet to jump to the PM4 IB.
     struct amd_aql_pm4_ib {
       uint16_t header;
@@ -1001,7 +1002,7 @@ void AqlQueue::InitScratchSRD() {
   SQ_BUF_RSRC_WORD0 srd0;
   SQ_BUF_RSRC_WORD1 srd1;
   SQ_BUF_RSRC_WORD2 srd2;
-  SQ_BUF_RSRC_WORD3 srd3;
+  uint32_t srd3_u32;
 
   uint32_t scratch_base_hi = 0;
   uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base);
@@ -1017,33 +1018,60 @@ void AqlQueue::InitScratchSRD() {
 
   srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
 
-  srd3.bits.DST_SEL_X = SQ_SEL_X;
-  srd3.bits.DST_SEL_Y = SQ_SEL_Y;
-  srd3.bits.DST_SEL_Z = SQ_SEL_Z;
-  srd3.bits.DST_SEL_W = SQ_SEL_W;
-  srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
-  srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
-  srd3.bits.ELEMENT_SIZE = 1;  // 4
-  srd3.bits.INDEX_STRIDE = 3;  // 64
-  srd3.bits.ADD_TID_ENABLE = 1;
-  srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
-  srd3.bits.HASH_ENABLE = 0;
-  srd3.bits.HEAP = 0;
-  srd3.bits.MTYPE__CI__VI = 0;
-  srd3.bits.TYPE = SQ_RSRC_BUF;
+  if (agent_->isa()->GetMajorVersion() < 10) {
+    SQ_BUF_RSRC_WORD3 srd3;
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
+    srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
+    srd3.bits.ELEMENT_SIZE = 1;  // 4
+    srd3.bits.INDEX_STRIDE = 3;  // 64
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.ATC__CI__VI = (agent_->profile() == HSA_PROFILE_FULL);
+    srd3.bits.HASH_ENABLE = 0;
+    srd3.bits.HEAP = 0;
+    srd3.bits.MTYPE__CI__VI = 0;
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  } else {
+    SQ_BUF_RSRC_WORD3_GFX10 srd3;
+
+    srd3.bits.DST_SEL_X = SQ_SEL_X;
+    srd3.bits.DST_SEL_Y = SQ_SEL_Y;
+    srd3.bits.DST_SEL_Z = SQ_SEL_Z;
+    srd3.bits.DST_SEL_W = SQ_SEL_W;
+    srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
+    srd3.bits.RESERVED1 = 0;
+    srd3.bits.INDEX_STRIDE = 0;  // filled in by CP
+    srd3.bits.ADD_TID_ENABLE = 1;
+    srd3.bits.RESOURCE_LEVEL = 1;
+    srd3.bits.RESERVED2 = 0;
+    srd3.bits.OOB_SELECT = 2;  // no bounds check in swizzle mode
+    srd3.bits.TYPE = SQ_RSRC_BUF;
+
+    srd3_u32 = srd3.u32All;
+  }
 
   // Update Queue's Scratch descriptor's property
   amd_queue_.scratch_resource_descriptor[0] = srd0.u32All;
   amd_queue_.scratch_resource_descriptor[1] = srd1.u32All;
   amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
-  amd_queue_.scratch_resource_descriptor[3] = srd3.u32All;
+  amd_queue_.scratch_resource_descriptor[3] = srd3_u32;
 
   // Populate flat scratch parameters in amd_queue_.
   amd_queue_.scratch_backing_memory_location =
       queue_scratch_.queue_process_offset;
   amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;
-  amd_queue_.scratch_workitem_byte_size =
-      uint32_t(queue_scratch_.size_per_thread);
+
+  // For backwards compatibility this field records the per-lane scratch
+  // for a 64 lane wavefront. If scratch was allocated for 32 lane waves
+  // then the effective size for a 64 lane wave is halved.
+  amd_queue_.scratch_wave64_lane_byte_size =
+      uint32_t((queue_scratch_.size_per_thread * queue_scratch_.lanes_per_wave) / 64);
 
   // Set concurrent wavefront limits only when scratch is being used.
   COMPUTE_TMPRING_SIZE tmpring_size = {};
@@ -1059,8 +1087,8 @@ void AqlQueue::InitScratchSRD() {
 
   // Scratch is allocated program COMPUTE_TMPRING_SIZE register
   // Scratch Size per Wave is specified in terms of kilobytes
-  uint32_t wave_size = agent_props.WaveFrontSize;
-  uint32_t wave_scratch = (((wave_size * queue_scratch_.size_per_thread) + 1023) / 1024);
+  uint32_t wave_scratch = (((queue_scratch_.lanes_per_wave *
+                               queue_scratch_.size_per_thread) + 1023) / 1024);
   tmpring_size.bits.WAVESIZE = wave_scratch;
   assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
   uint32_t num_waves = (queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024));
diff --git a/src/core/runtime/amd_blit_sdma.cpp b/src/core/runtime/amd_blit_sdma.cpp
index 2a189de89..188345e2d 100644
--- a/src/core/runtime/amd_blit_sdma.cpp
+++ b/src/core/runtime/amd_blit_sdma.cpp
@@ -142,7 +142,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
   }
 
   // HDP flush supported on gfx900 and forward.
-  if (agent_->isa()->GetMajorVersion() > 8) {
+  // FIXME: Not working on gfx10, raises SRBM write protection interrupt.
+  if (agent_->isa()->GetMajorVersion() == 9) {
     hdp_flush_support_ = true;
   }
 
@@ -623,6 +624,10 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFenceComma
 
   packet_addr->HEADER_UNION.op = SDMA_OP_FENCE;
 
+  if (agent_->isa()->GetMajorVersion() >= 10) {
+    packet_addr->HEADER_UNION.mtype = 3;
+  }
+
   packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(fence);
 
   packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(fence);
diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
index 5b27486dd..08950dda9 100644
--- a/src/core/runtime/amd_gpu_agent.cpp
+++ b/src/core/runtime/amd_gpu_agent.cpp
@@ -173,6 +173,7 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
     ASICShader compute_7;
     ASICShader compute_8;
     ASICShader compute_9;
+    ASICShader compute_10;
   };
 
   std::map<std::string, CompiledShader> compiled_shaders = {
@@ -181,24 +182,28 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
            {NULL, 0, 0, 0},
            {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},
            {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},
+           {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4},
        }},
       {"CopyAligned",
        {
            {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},
            {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
            {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},
+           {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},
        }},
       {"CopyMisaligned",
        {
            {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},
            {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
            {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},
+           {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},
        }},
       {"Fill",
        {
            {kCodeFill7, sizeof(kCodeFill7), 19, 8},
            {kCodeFill8, sizeof(kCodeFill8), 19, 8},
            {kCodeFill8, sizeof(kCodeFill8), 19, 8},
+           {kCodeFill10, sizeof(kCodeFill10), 19, 8},
        }}};
 
   auto compiled_shader_it = compiled_shaders.find(func_name);
@@ -217,6 +222,9 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
     case 9:
       asic_shader = &compiled_shader_it->second.compute_9;
       break;
+    case 10:
+      asic_shader = &compiled_shader_it->second.compute_10;
+      break;
     default:
       assert(false && "Precompiled shader unavailable for target");
   }
@@ -886,13 +894,22 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
   // Allocate scratch memory
   ScratchInfo scratch;
   if (private_segment_size == UINT_MAX) {
-    private_segment_size = 0;
+    private_segment_size = (profile_ == HSA_PROFILE_BASE) ? 0 : scratch_per_thread_;
+  }
+
+  if (private_segment_size > 262128) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  scratch.lanes_per_wave = 64;
+  scratch.size_per_thread = AlignUp(private_segment_size, 1024 / scratch.lanes_per_wave);
+  if (scratch.size_per_thread > 262128) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
   scratch.size_per_thread = private_segment_size;
 
   const uint32_t num_cu = properties_.NumFComputeCores / properties_.NumSIMDPerCU;
-  scratch.size =
-      scratch.size_per_thread * properties_.MaxSlotsScratchCU * properties_.WaveFrontSize * num_cu;
+  scratch.size = scratch.size_per_thread * 32 * scratch.lanes_per_wave * num_cu;
   scratch.queue_base = nullptr;
   scratch.queue_process_offset = 0;
 
@@ -941,7 +958,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
   ScopedAcquire<KernelMutex> lock(&scratch_lock_);
   // Limit to 1/8th of scratch pool for small scratch and 1/4 of that for a single queue.
   size_t small_limit = scratch_pool_.size() >> 3;
-  size_t single_limit = small_limit >> 2;
+  // Lift limit for 2.10 release RCCL workaround.
+  size_t single_limit = 146800640; //small_limit >> 2;
   bool large = (scratch.size > single_limit) ||
       (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit);
   large = (isa_->GetMajorVersion() < 8) ? false : large;
@@ -1186,29 +1204,40 @@ void GpuAgent::InvalidateCodeCaches() {
       // Microcode is handling code cache invalidation.
       return;
     }
-  } else if (isa_->GetMajorVersion() > 9) {
+  } else if (isa_->GetMajorVersion() > 10) {
     assert(false && "Code cache invalidation not implemented for this agent");
   }
 
   // Invalidate caches which may hold lines of code object allocation.
-  constexpr uint32_t cache_inv_size_dw = 7;
-  uint32_t cache_inv[cache_inv_size_dw];
+  uint32_t cache_inv[8] = {0};
+  uint32_t cache_inv_size_dw;
+
+  if (isa_->GetMajorVersion() < 10) {
+      cache_inv[1] = PM4_ACQUIRE_MEM_DW1_COHER_CNTL(
+          PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA |
+          PM4_ACQUIRE_MEM_COHER_CNTL_SH_KCACHE_ACTION_ENA |
+          PM4_ACQUIRE_MEM_COHER_CNTL_TC_ACTION_ENA |
+          PM4_ACQUIRE_MEM_COHER_CNTL_TC_WB_ACTION_ENA);
+
+      cache_inv_size_dw = 7;
+  } else {
+      cache_inv[7] = PM4_ACQUIRE_MEM_DW7_GCR_CNTL(
+          PM4_ACQUIRE_MEM_GCR_CNTL_GLI_INV(1) |
+          PM4_ACQUIRE_MEM_GCR_CNTL_GLK_INV |
+          PM4_ACQUIRE_MEM_GCR_CNTL_GLV_INV |
+          PM4_ACQUIRE_MEM_GCR_CNTL_GL1_INV |
+          PM4_ACQUIRE_MEM_GCR_CNTL_GL2_INV);
+
+      cache_inv_size_dw = 8;
+  }
 
   cache_inv[0] = PM4_HDR(PM4_HDR_IT_OPCODE_ACQUIRE_MEM, cache_inv_size_dw,
-                         isa_->GetMajorVersion());
-  cache_inv[1] = PM4_ACQUIRE_MEM_DW1_COHER_CNTL(
-      PM4_ACQUIRE_MEM_COHER_CNTL_SH_ICACHE_ACTION_ENA |
-      PM4_ACQUIRE_MEM_COHER_CNTL_SH_KCACHE_ACTION_ENA |
-      PM4_ACQUIRE_MEM_COHER_CNTL_TC_ACTION_ENA |
-      PM4_ACQUIRE_MEM_COHER_CNTL_TC_WB_ACTION_ENA);
+             isa_->GetMajorVersion());
   cache_inv[2] = PM4_ACQUIRE_MEM_DW2_COHER_SIZE(0xFFFFFFFF);
   cache_inv[3] = PM4_ACQUIRE_MEM_DW3_COHER_SIZE_HI(0xFF);
-  cache_inv[4] = 0;
-  cache_inv[5] = 0;
-  cache_inv[6] = 0;
 
   // Submit the command to the utility queue and wait for it to complete.
-  queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv));
+  queues_[QueueUtility]->ExecutePM4(cache_inv, cache_inv_size_dw * sizeof(uint32_t));
 }
 
 lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp
index bbb599db4..bb8e75881 100644
--- a/src/core/runtime/hsa.cpp
+++ b/src/core/runtime/hsa.cpp
@@ -1887,6 +1887,12 @@ static std::string ConvertOldTargetNameToNew(
     NewName = "amdgcn-amd-amdhsa--gfx906";
   else if (OldName == "AMD:AMDGPU:9:0:8")
     NewName = "amdgcn-amd-amdhsa--gfx908";
+  else if (OldName == "AMD:AMDGPU:10:1:0")
+    NewName = "amdgcn-amd-amdhsa--gfx1010";
+  else if (OldName == "AMD:AMDGPU:10:1:1")
+    NewName = "amdgcn-amd-amdhsa--gfx1011";
+  else if (OldName == "AMD:AMDGPU:10:1:2")
+    NewName = "amdgcn-amd-amdhsa--gfx1012";
   else
     assert(false && "Unhandled target");
 
diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp
index 77631a8fe..2fa7d2872 100755
--- a/src/core/runtime/isa.cpp
+++ b/src/core/runtime/isa.cpp
@@ -218,6 +218,9 @@ const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() {
   ISAREG_ENTRY_GEN(9, 0, 6, false, true )
   ISAREG_ENTRY_GEN(9, 0, 8, false, true )
   ISAREG_ENTRY_GEN(9, 0, 8, false, false)
+  ISAREG_ENTRY_GEN(10, 1, 0, false, false)
+  ISAREG_ENTRY_GEN(10, 1, 1, false, false)
+  ISAREG_ENTRY_GEN(10, 1, 2, false, false)
 
   return supported_isas;
 }
diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h
index edee52605..4d9dba5b8 100644
--- a/src/inc/amd_hsa_elf.h
+++ b/src/inc/amd_hsa_elf.h
@@ -68,6 +68,9 @@
 #define EF_AMDGPU_MACH_AMDGCN_GFX906_LC 0x02f
 #define EF_AMDGPU_MACH_AMDGCN_GFX908_LC 0x030
 #define EF_AMDGPU_MACH_AMDGCN_GFX909_LC 0x031
+#define EF_AMDGPU_MACH_AMDGCN_GFX1010_LC 0x033
+#define EF_AMDGPU_MACH_AMDGCN_GFX1011_LC 0x034
+#define EF_AMDGPU_MACH_AMDGCN_GFX1012_LC 0x035
 #define EF_AMDGPU_XNACK_LC 0x100
 #define EF_AMDGPU_SRAM_ECC_LC 0x200
 
diff --git a/src/inc/amd_hsa_queue.h b/src/inc/amd_hsa_queue.h
index 2176e8470..2da98964d 100644
--- a/src/inc/amd_hsa_queue.h
+++ b/src/inc/amd_hsa_queue.h
@@ -77,7 +77,7 @@ typedef struct AMD_QUEUE_ALIGN amd_queue_s {
   uint32_t scratch_resource_descriptor[4];
   uint64_t scratch_backing_memory_location;
   uint64_t scratch_backing_memory_byte_size;
-  uint32_t scratch_workitem_byte_size;
+  uint32_t scratch_wave64_lane_byte_size;
   amd_queue_properties32_t queue_properties;
   uint32_t reserved3[2];
   hsa_signal_t queue_inactive_signal;
diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp
index 8bec5b2d5..7a133ff9c 100644
--- a/src/libamdhsacode/amd_hsa_code.cpp
+++ b/src/libamdhsacode/amd_hsa_code.cpp
@@ -579,6 +579,12 @@ namespace code {
         NewName = "amdgcn-amd-amdhsa--gfx906";
       else if (OldName == "AMD:AMDGPU:9:0:8")
         NewName = "amdgcn-amd-amdhsa--gfx908";
+      else if (OldName == "AMD:AMDGPU:10:1:0")
+        NewName = "amdgcn-amd-amdhsa--gfx1010";
+      else if (OldName == "AMD:AMDGPU:10:1:1")
+        NewName = "amdgcn-amd-amdhsa--gfx1011";
+      else if (OldName == "AMD:AMDGPU:10:1:2")
+        NewName = "amdgcn-amd-amdhsa--gfx1012";
       else
         assert(false && "Unhandled target");
 
@@ -631,6 +637,9 @@ namespace code {
         case EF_AMDGPU_MACH_AMDGCN_GFX904_LC: isaName += "gfx904"; break;
         case EF_AMDGPU_MACH_AMDGCN_GFX906_LC: isaName += "gfx906"; break;
         case EF_AMDGPU_MACH_AMDGCN_GFX908_LC: isaName += "gfx908"; break;
+        case EF_AMDGPU_MACH_AMDGCN_GFX1010_LC: isaName += "gfx1010"; break;
+        case EF_AMDGPU_MACH_AMDGCN_GFX1011_LC: isaName += "gfx1011"; break;
+        case EF_AMDGPU_MACH_AMDGCN_GFX1012_LC: isaName += "gfx1012"; break;
         default: return false;
         }
 
@@ -1348,6 +1357,8 @@ namespace code {
           asic = "VI";
         } else if (major_version == 9) {
           asic = "GFX9";
+        } else if (major_version == 10) {
+          asic = "GFX10";
         } else {
           assert(!"unknown compute capability");
         }
diff --git a/src/loader/loaders.cpp b/src/loader/loaders.cpp
index 439d8930d..5c72664d5 100644
--- a/src/loader/loaders.cpp
+++ b/src/loader/loaders.cpp
@@ -92,6 +92,9 @@ namespace loader {
     gfx902.handle = 902;
     gfx903.handle = 903;
     gfx908.handle = 908;
+    gfx1010.handle = 1010;
+    gfx1011.handle = 1011;
+    gfx1012.handle = 1012;
   }
 
   hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
@@ -123,6 +126,12 @@ namespace loader {
       return gfx903;
     } else if (sname == "AMD:AMDGPU:9:0:8") {
       return gfx908;
+    } else if (sname == "AMD:AMDGPU:10:1:0") {
+      return gfx1010;
+    } else if (sname == "AMD:AMDGPU:10:1:1") {
+      return gfx1011;
+    } else if (sname == "AMD:AMDGPU:10:1:2") {
+      return gfx1012;
     }
 
     assert(0);
diff --git a/src/loader/loaders.hpp b/src/loader/loaders.hpp
index dc8addd23..a16eac078 100644
--- a/src/loader/loaders.hpp
+++ b/src/loader/loaders.hpp
@@ -56,6 +56,7 @@ namespace loader {
     hsa_isa_t invalid;
     hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810;
     hsa_isa_t gfx900, gfx901, gfx902, gfx903, gfx908;
+    hsa_isa_t gfx1010, gfx1011, gfx1012;
     std::ostream& out;
     typedef std::set<void*> PointerSet;
     PointerSet pointers;