diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 394c77d09387b..1604d7e104ac0 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ #define DEBUG_TYPE "sycl-fusion" using namespace llvm; +using namespace PatternMatch; constexpr static StringLiteral PrivatePromotion{"private"}; constexpr static StringLiteral LocalPromotion{"local"}; @@ -191,22 +193,10 @@ static void updateInternalizationMD(Function *F, StringRef Kind, /// address space has changed from N to N / LocalSize. static void remap(GetElementPtrInst *GEPI, const PromotionInfo &PromInfo) { IRBuilder<> Builder{GEPI}; - Value *C0 = Builder.getInt64(0); - - auto NIdx = GEPI->getNumIndices(); - if (NIdx > 1) { - // `GEPI` indexes into an aggregate. If the first index is 0, the base - // pointer is used as-is and we do not need to perform remapping. This is - // the common case. - // TODO: Support non-zero pointer offset, too. If the pointer operand is - // a GEP as well, we must check if the source element types match. - assert(GEPI->idx_begin()->get() == C0); - return; - } if (PromInfo.LocalSize == 1) { // Squash the index and let instcombine clean-up afterwards. - GEPI->idx_begin()->set(C0); + GEPI->idx_begin()->set(Builder.getInt64(0)); return; } @@ -290,6 +280,43 @@ Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, return Error::success(); } +enum GEPKind { INVALID = 0, NEEDS_REMAPPING, ADDRESSES_INTO_AGGREGATE }; + +static int getGEPKind(GetElementPtrInst *GEPI, const PromotionInfo &PromInfo) { + assert(GEPI->getNumIndices() >= 1 && "No-op GEP encountered"); + + // Inspect the GEP's source element type. 
+  auto &DL = GEPI->getModule()->getDataLayout();
+  auto SrcElemTySz = DL.getTypeAllocSize(GEPI->getSourceElementType());
+
+  // `GEPI`'s first index is selecting elements. Unless it is constant zero, we
+  // have to remap. If there are more indices, we start to address into an
+  // aggregate type.
+  if (SrcElemTySz == PromInfo.ElemSize) {
+    int Kind = INVALID;
+    if (!match(GEPI->idx_begin()->get(), m_ZeroInt()))
+      Kind |= NEEDS_REMAPPING;
+    if (GEPI->getNumIndices() >= 2)
+      Kind |= ADDRESSES_INTO_AGGREGATE;
+    assert(Kind != INVALID && "No-op GEP encountered");
+    return Kind;
+  }
+
+  // Check whether `GEPI` adds a constant offset, e.g. a byte offset to address
+  // into a padded structure, smaller than the element size.
+  MapVector<Value *, APInt> VariableOffsets;
+  auto IW = DL.getIndexSizeInBits(GEPI->getPointerAddressSpace());
+  APInt ConstantOffset = APInt::getZero(IW);
+  if (GEPI->collectOffset(DL, IW, VariableOffsets, ConstantOffset) &&
+      VariableOffsets.empty() &&
+      ConstantOffset.getZExtValue() < PromInfo.ElemSize) {
+    return ADDRESSES_INTO_AGGREGATE;
+  }
+
+  // We don't know what `GEPI` addresses; bail out.
+  return INVALID;
+}
+
 Error SYCLInternalizerImpl::canPromoteGEP(GetElementPtrInst *GEPI,
                                           const Value *Val,
                                           const PromotionInfo &PromInfo,
@@ -299,12 +326,17 @@ Error SYCLInternalizerImpl::canPromoteGEP(GetElementPtrInst *GEPI,
     // required.
     return Error::success();
   }
-  // Recurse to check all users of the GEP. We are either already in
-  // `InAggregate` mode, or inspect the current instruction. Recall that a GEP's
-  // first index is used to step through the base pointer, whereas any
-  // additional indices represent addressing into an aggregrate type.
+
+  // Inspect the current instruction.
+  auto Kind = getGEPKind(GEPI, PromInfo);
+  if (Kind == INVALID) {
+    return createStringError(inconvertibleErrorCode(),
+                             "Unsupported pointer arithmetic");
+  }
+
+  // Recurse to check all users of the GEP.
   return canPromoteValue(GEPI, PromInfo,
-                         InAggregate || GEPI->getNumIndices() >= 2);
+                         InAggregate || (Kind & ADDRESSES_INTO_AGGREGATE));
 }
 
 Error SYCLInternalizerImpl::canPromoteValue(Value *Val,
@@ -423,15 +455,17 @@ void SYCLInternalizerImpl::promoteGEPI(GetElementPtrInst *GEPI,
                                        bool InAggregate) const {
   // Not PointerType is unreachable. Other case is caught in caller.
   if (cast<PointerType>(GEPI->getType())->getAddressSpace() != AS) {
-    if (!InAggregate)
+    auto Kind = getGEPKind(GEPI, PromInfo);
+    assert(Kind != INVALID);
+
+    if (!InAggregate && (Kind & NEEDS_REMAPPING)) {
       remap(GEPI, PromInfo);
+    }
     GEPI->mutateType(PointerType::get(GEPI->getContext(), AS));
-    // Recurse to promote to all users of the GEP. We are either already in
-    // `InAggregate` mode, or inspect the current instruction. Recall that a
-    // GEP's first index is used to step through the base pointer, whereas any
-    // additional indices represent addressing into an aggregrate type.
+
+    // Recurse to promote to all users of the GEP.
return promoteValue(GEPI, PromInfo, - InAggregate || GEPI->getNumIndices() >= 2); + InAggregate || (Kind & ADDRESSES_INTO_AGGREGATE)); } } diff --git a/sycl-fusion/test/internalization/promote-private-non-unit-cuda.ll b/sycl-fusion/test/internalization/promote-private-non-unit-cuda.ll new file mode 100644 index 0000000000000..d786ab1529667 --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-non-unit-cuda.ll @@ -0,0 +1,116 @@ +; REQUIRES: cuda +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization -S %s | FileCheck %s + +; This test is a reduced IR version of +; sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp for CUDA + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.MyStruct = type { i32, %"class.sycl::_V1::vec" } +%"class.sycl::_V1::vec" = type { <3 x i32> } + +declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 +declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0 +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 +declare ptr @llvm.nvvm.implicit.offset() #1 + +define void @fused_0(ptr addrspace(1) nocapture noundef align 16 %KernelOne__arg_accTmp, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp3, + ptr addrspace(1) nocapture noundef readonly align 4 %KernelOne__arg_accIn, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accIn6, + ptr addrspace(1) nocapture noundef align 1 %KernelOne__arg_accTmp27, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp210, + ptr addrspace(1) nocapture noundef writeonly align 4 %KernelTwo__arg_accOut, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelTwo__arg_accOut3) + 
local_unnamed_addr #3 !sycl.kernel.promote !13 !sycl.kernel.promote.localsize !14 !sycl.kernel.promote.elemsize !15 { +; CHECK-LABEL: define void @fused_0( +; CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP3:%[^,]*accTmp3]], +; CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP210:%[^,]*accTmp210]] +; CHECK: entry: +; CHECK: [[TMP0:%.*]] = alloca i8, i64 3, align 1 +; CHECK: [[TMP1:%.*]] = alloca i8, i64 96, align 16 +; CHECK: [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP210]], align 8 +; CHECK: [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP3]], align 8 +; CHECK: [[TMP2:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[TMP3:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[MUL:%.*]] = mul nuw nsw i64 [[GLOBAL_ID:.*]], 3 +; CHECK: [[ADD:%.*]] = add nuw nsw i64 [[MUL]], 1 +; CHECK: [[TMP10:%.*]] = add i64 [[TMP2]], [[ADD]] +; CHECK: [[TMP11:%.*]] = urem i64 [[TMP10]], 3 +; CHECK: [[ARRAYIDX_1:%.*]] = getelementptr inbounds %struct.MyStruct, ptr [[TMP1]], i64 [[TMP11]] + +; COM: This i8-GEP _was_ not remapped because it addresses into a single MyStruct element +; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_1]], i64 20 +; CHECK: store i32 {{.*}}, ptr [[ARRAYIDX_2]], align 4 +; CHECK: [[TMP12:%.*]] = add i64 [[TMP3]], [[ADD]] +; CHECK: [[TMP13:%.*]] = urem i64 [[TMP12]], 3 + +; COM: This i8-GEP was remapped because it selects an element of the underlying i8-buffer +; CHECK: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[TMP13]] + +; CHECK: store i8 {{.*}}, ptr [[ARRAYIDX_3]], align 1 +; CHECK: store i32 {{.*}}, ptr addrspace(1) +; CHECK: ret void +; +entry: + %KernelOne__arg_accTmp2103.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp210, align 8 + 
%KernelOne__arg_accIn62.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accIn6, align 8 + %KernelOne__arg_accTmp31.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp3, align 8 + %add.ptr.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp, i64 %KernelOne__arg_accTmp31.sroa.0.0.copyload + %add.ptr.i37.i = getelementptr inbounds i32, ptr addrspace(1) %KernelOne__arg_accIn, i64 %KernelOne__arg_accIn62.sroa.0.0.copyload + %add.ptr.i43.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp27, i64 %KernelOne__arg_accTmp2103.sroa.0.0.copyload + %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %conv.i1.j7 = sext i32 %0 to i64 + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %conv.i3.j7 = sext i32 %1 to i64 + %mul.j7 = mul nsw i64 %conv.i3.j7, %conv.i1.j7 + %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %conv.i2.j7 = sext i32 %2 to i64 + %add.j7 = add nsw i64 %mul.j7, %conv.i2.j7 + %3 = tail call ptr @llvm.nvvm.implicit.offset() + %4 = load i32, ptr %3, align 4 + %conv.j8 = zext i32 %4 to i64 + %add4.j7 = add nsw i64 %add.j7, %conv.j8 + %mul.j2 = mul nuw nsw i64 %add4.j7, 3 + %add.j2 = add nuw nsw i64 %mul.j2, 1 + %arrayidx.j2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i37.i, i64 %add.j2 + %5 = load i32, ptr addrspace(1) %arrayidx.j2, align 4 + %arrayidx.i55.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.j2 + %arrayidx.j3 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i55.i, i64 20 + store i32 %5, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.j2 = trunc i32 %5 to i8 + %arrayidx.i73.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i43.i, i64 %add.j2 + store i8 %conv.j2, ptr addrspace(1) %arrayidx.i73.i, align 1 + %KernelTwo__arg_accOut34.sroa.0.0.copyload = load i64, ptr %KernelTwo__arg_accOut3, align 8 + %add.ptr.i.i7 = getelementptr inbounds i32, ptr addrspace(1) %KernelTwo__arg_accOut, i64 
%KernelTwo__arg_accOut34.sroa.0.0.copyload + %6 = load i32, ptr %3, align 4 + %conv.j7.i13 = zext i32 %6 to i64 + %add4.j6.i14 = add nsw i64 %add.j7, %conv.j7.i13 + %mul.i.i16 = mul nuw nsw i64 %add4.j6.i14, 3 + %add.i45.i = add nuw nsw i64 %mul.i.i16, 1 + %arrayidx.i.i17 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.i45.i + %arrayidx.j2.i19 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i.i17, i64 20 + %7 = load i32, ptr addrspace(1) %arrayidx.j2.i19, align 4 + %arrayidx.i55.i20 = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i43.i, i64 %add.i45.i + %8 = load i8, ptr addrspace(1) %arrayidx.i55.i20, align 1 + %conv.i.i22 = sext i8 %8 to i32 + %add.i.i23 = add nsw i32 %7, %conv.i.i22 + %arrayidx.i64.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i7, i64 %add.i45.i + store i32 %add.i.i23, ptr addrspace(1) %arrayidx.i64.i, align 4 + ret void +} + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nofree nosync nounwind speculatable memory(none) } +attributes #3 = { nofree nosync nounwind memory(read, argmem: readwrite, inaccessiblemem: write) "frame-pointer"="all" "target-cpu"="sm_80" "target-features"="+ptx82,+sm_80" "uniform-work-group-size"="true" } + +!nvvm.annotations = !{!10} + +!10 = !{ptr @fused_0, !"kernel", i32 1} +!13 = !{!"private", !"none", !"none", !"none", !"private", !"none", !"none", !"none"} +!14 = !{i64 3, !"", !"", !"", i64 3, !"", !"", !""} +!15 = !{i64 32, !"", !"", !"", i64 1, !"", !"", !""} diff --git a/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll b/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll new file mode 100644 index 0000000000000..b08d7ba472e57 --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll @@ -0,0 +1,137 @@ +; REQUIRES: hip_amd +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: 
-passes=sycl-internalization -S %s | FileCheck %s + +; This test is the IR version of +; sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp for HIP. +; In contrast to the SPIR-V and CUDA versions, the sycl::vec in the test data +; structure is addressed via a multi-index GEP with a non-zero first index. + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn-amd-amdhsa" + +%struct.MyStruct = type { i32, %"class.sycl::_V1::vec" } +%"class.sycl::_V1::vec" = type { <3 x i32> } + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() #1 + +define amdgpu_kernel void @fused_0(ptr addrspace(1) noundef align 16 %KernelOne__arg_accTmp31, + i64 %KernelOne__arg_accTmp.coerce, + i64 %KernelOne__arg_accTmp.coerce3, + i64 %KernelOne__arg_accTmp.coerce7, + ptr addrspace(1) noundef align 4 %KernelOne__arg_accIn35, + i64 %KernelOne__arg_accIn.coerce, + i64 %KernelOne__arg_accIn.coerce13, + i64 %KernelOne__arg_accIn.coerce17, + ptr addrspace(1) noundef align 1 %KernelOne__arg_accTmp239, + i64 %KernelOne__arg_accTmp2.coerce, + i64 %KernelOne__arg_accTmp2.coerce24, + i64 %KernelOne__arg_accTmp2.coerce28, + ptr addrspace(1) noundef align 4 %KernelTwo__arg_accOut30, + i64 %KernelTwo__arg_accOut.coerce, + i64 %KernelTwo__arg_accOut.coerce3, + i64 %KernelTwo__arg_accOut.coerce7) + #3 !sycl.kernel.promote !12 !sycl.kernel.promote.localsize !13 !sycl.kernel.promote.elemsize !14 { +; CHECK-LABEL: define amdgpu_kernel void @fused_0( +; CHECK-SAME: i64 [[KERNELONE__ARG_ACCTMP_COERCE7:%[^,]*accTmp.coerce7]] +; CHECK-SAME: i64 [[KERNELONE__ARG_ACCTMP2_COERCE28:%[^,]*accTmp2.coerce28]] +; CHECK: entry: +; CHECK: [[TMP0:%.*]] = 
alloca i8, i64 3, align 1, addrspace(5) +; CHECK: [[TMP1:%.*]] = alloca i8, i64 96, align 16, addrspace(5) +; CHECK: [[TMP2:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP_COERCE7]], 3 +; CHECK: [[TMP3:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP2_COERCE28]], 3 +; CHECK: [[MUL:%.*]] = mul nuw nsw i64 [[GLOBAL_ID:.*]], 3 +; CHECK: [[ADD:%.*]] = add nuw nsw i64 [[MUL]], 1 +; CHECK: [[TMP11:%.*]] = add i64 [[TMP2]], [[ADD]] +; CHECK: [[TMP12:%.*]] = urem i64 [[TMP11]], 3 + +; COM: This is a multi-index GEP into the aggregate. We have to remap the first index. +; CHECK: [[V:%.*]] = getelementptr inbounds %struct.MyStruct, ptr addrspace(5) [[TMP1]], i64 [[TMP12]], i32 1 + +; COM: This is a single-index GEP which shall not be remapped because its pointer operand already points into the struct (see above). +; CHECK: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[V]], i64 1 + +; CHECK: store i32 {{.*}}, ptr addrspace(5) [[ARRAYIDX_1]], align 4 +; CHECK: [[TMP13:%.*]] = add i64 [[TMP3]], [[ADD]] +; CHECK: [[TMP14:%.*]] = urem i64 [[TMP13]], 3 + +; COM: This i8-GEP was remapped because it selects an element of the underlying i8-buffer +; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[TMP0]], i64 [[TMP14]] +; CHECK: store i8 {{.*}}, ptr addrspace(5) [[ARRAYIDX_2]], align 1 +; CHECK: store i32 {{.*}} ptr addrspace(1) +; CHECK: ret void +; +entry: + %add.ptr.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp31, i64 %KernelOne__arg_accTmp.coerce7 + %add.ptr.i82.i = getelementptr inbounds i32, ptr addrspace(1) %KernelOne__arg_accIn35, i64 %KernelOne__arg_accIn.coerce17 + %add.ptr.i85.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp239, i64 %KernelOne__arg_accTmp2.coerce28 + %0 = call i32 @llvm.amdgcn.workgroup.id.x() + %conv.i1.j7 = zext i32 %0 to i64 + %1 = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + %arrayidx.j8 = getelementptr inbounds i16, ptr 
addrspace(4) %1, i64 2 + %2 = load i16, ptr addrspace(4) %arrayidx.j8, align 4 + %conv.j8 = zext i16 %2 to i64 + %mul.j7 = mul nuw nsw i64 %conv.j8, %conv.i1.j7 + %3 = call i32 @llvm.amdgcn.workitem.id.x(), !range !20, !noundef !21 + %conv.i2.j7 = zext nneg i32 %3 to i64 + %add.j7 = add nuw nsw i64 %mul.j7, %conv.i2.j7 + %4 = call ptr addrspace(5) @llvm.amdgcn.implicit.offset() + %5 = load i32, ptr addrspace(5) %4, align 4 + %zext.j8 = zext i32 %5 to i64 + %add4.j7 = add nuw nsw i64 %add.j7, %zext.j8 + %mul.j2 = mul nuw nsw i64 %add4.j7, 3 + %add.j2 = add nuw nsw i64 %mul.j2, 1 + %arrayidx.j2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i82.i, i64 %add.j2 + %6 = load i32, ptr addrspace(1) %arrayidx.j2, align 4 + %v.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.j2, i32 1 + %arrayidx.j3 = getelementptr inbounds i32, ptr addrspace(1) %v.j2, i64 1 + store i32 %6, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.j2 = trunc i32 %6 to i8 + %arrayidx.i104.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i85.i, i64 %add.j2 + store i8 %conv.j2, ptr addrspace(1) %arrayidx.i104.i, align 1 + %add.ptr.i.i1 = getelementptr inbounds i32, ptr addrspace(1) %KernelTwo__arg_accOut30, i64 %KernelTwo__arg_accOut.coerce7 + %add.ptr.i81.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp31, i64 %KernelOne__arg_accTmp.coerce7 + %add.ptr.i84.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp239, i64 %KernelOne__arg_accTmp2.coerce28 + %7 = call i32 @llvm.amdgcn.workgroup.id.x() + %conv.i1.j6.i2 = zext i32 %7 to i64 + %8 = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + %arrayidx.j7.i3 = getelementptr inbounds i16, ptr addrspace(4) %8, i64 2 + %9 = load i16, ptr addrspace(4) %arrayidx.j7.i3, align 4 + %conv.j7.i4 = zext i16 %9 to i64 + %mul.j6.i5 = mul nuw nsw i64 %conv.j7.i4, %conv.i1.j6.i2 + %10 = call i32 @llvm.amdgcn.workitem.id.x(), !range !20, 
!noundef !21 + %conv.i2.j6.i6 = zext nneg i32 %10 to i64 + %add.j6.i7 = add nuw nsw i64 %mul.j6.i5, %conv.i2.j6.i6 + %11 = call ptr addrspace(5) @llvm.amdgcn.implicit.offset() + %12 = load i32, ptr addrspace(5) %11, align 4 + %zext.j7.i8 = zext i32 %12 to i64 + %add4.j6.i9 = add nuw nsw i64 %add.j6.i7, %zext.j7.i8 + %mul.i.i11 = mul nuw nsw i64 %add4.j6.i9, 3 + %add.i87.i = add nuw nsw i64 %mul.i.i11, 1 + %v.i.i12 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.i81.i, i64 %add.i87.i, i32 1 + %arrayidx.j2.i13 = getelementptr inbounds i32, ptr addrspace(1) %v.i.i12, i64 1 + %13 = load i32, ptr addrspace(1) %arrayidx.j2.i13, align 4 + %arrayidx.i92.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i84.i, i64 %add.i87.i + %14 = load i8, ptr addrspace(1) %arrayidx.i92.i, align 1 + %conv.i.i14 = sext i8 %14 to i32 + %add.i.i15 = add nsw i32 %13, %conv.i.i14 + %arrayidx.i98.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i1, i64 %add.i87.i + store i32 %add.i.i15, ptr addrspace(1) %arrayidx.i98.i, align 4 + ret void +} + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind speculatable memory(none) } +attributes #3 = { "frame-pointer"="all" "target-cpu"="gfx1031" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" } + +!amdgcn.annotations = !{!9} + +!9 = !{ptr @fused_0, !"kernel", i32 1} +!12 = !{!"private", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"private", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!13 = !{i64 3, !"", !"", !"", !"", !"", !"", !"", i64 3, !"", !"", !"", !"", !"", !"", !""} +!14 = !{i64 32, !"", !"", !"", !"", !"", !"", !"", i64 1, !"", !"", !"", !"", !"", !"", !""} +!20 = !{i32 0, i32 1024} +!21 = 
!{} diff --git a/sycl-fusion/test/internalization/promote-private-non-unit.ll b/sycl-fusion/test/internalization/promote-private-non-unit.ll new file mode 100644 index 0000000000000..b4b73aa9881b9 --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-non-unit.ll @@ -0,0 +1,91 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization -S %s | FileCheck %s + +; This test is a reduced IR version of +; sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.MyStruct = type { i32, %"class.sycl::_V1::vec" } +%"class.sycl::_V1::vec" = type { <3 x i32> } + +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) local_unnamed_addr #1 + +define spir_kernel void @fused_0(ptr addrspace(1) nocapture align 16 %KernelOne__arg_accTmp, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp3, + ptr addrspace(1) nocapture readonly align 4 %KernelOne__arg_accIn, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accIn6, + ptr addrspace(1) nocapture writeonly align 1 %KernelOne__arg_accTmp27, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp210, + ptr addrspace(1) nocapture writeonly align 4 %KernelTwo__arg_accOut, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelTwo__arg_accOut3) + local_unnamed_addr #2 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.promote.elemsize !13 { +; CHECK-LABEL: define spir_kernel void @fused_0( +; CHECK-SAME: ptr 
nocapture readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP3:%[^,]*accTmp3]], +; CHECK-SAME: ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP210:%[^,]*accTmp210]] +; CHECK: entry: +; CHECK: [[TMP0:%.*]] = alloca i8, i64 3, align 1 +; CHECK: [[TMP1:%.*]] = alloca i8, i64 96, align 16 +; CHECK: [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP210]], align 8 +; CHECK: [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP3]], align 8 +; CHECK: [[TMP2:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[TMP3:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[TMP4:%.*]] = tail call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) +; CHECK: [[MUL:%.*]] = mul nuw nsw i64 [[TMP4]], 3 +; CHECK: [[ADD:%.*]] = add nuw nsw i64 [[MUL]], 1 +; CHECK: [[TMP6:%.*]] = add i64 [[TMP2]], [[ADD]] +; CHECK: [[TMP7:%.*]] = urem i64 [[TMP6]], 3 +; CHECK: [[ARRAYIDX_1:%.*]] = getelementptr inbounds %struct.MyStruct, ptr [[TMP1]], i64 [[TMP7]] + +; COM: This i8-GEP _was_ not remapped because it addresses into a single MyStruct element +; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_1]], i64 20 + +; CHECK: store i32 {{.*}}, ptr [[ARRAYIDX_2]], align 4 +; CHECK: [[TMP8:%.*]] = add i64 [[TMP3]], [[ADD]] +; CHECK: [[TMP9:%.*]] = urem i64 [[TMP8]], 3 + +; COM: This i8-GEP was remapped because it selects an element of the underlying i8-buffer +; CHECK: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[TMP9]] + +; CHECK: store i8 {{.*}}, ptr [[ARRAYIDX_3]], align 1 +; CHECK: store i32 {{.*}}, ptr addrspace(1) +; CHECK: ret void +; +entry: + %KernelOne__arg_accTmp2103.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp210, align 8 + %KernelOne__arg_accIn62.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accIn6, align 8 + 
%KernelOne__arg_accTmp31.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp3, align 8 + %add.ptr.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp, i64 %KernelOne__arg_accTmp31.sroa.0.0.copyload + %add.ptr.i35.i = getelementptr inbounds i32, ptr addrspace(1) %KernelOne__arg_accIn, i64 %KernelOne__arg_accIn62.sroa.0.0.copyload + %add.ptr.i44.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp27, i64 %KernelOne__arg_accTmp2103.sroa.0.0.copyload + %0 = tail call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #3 + %mul.j2 = mul nuw nsw i64 %0, 3 + %add.j2 = add nuw nsw i64 %mul.j2, 1 + %arrayidx.j2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i35.i, i64 %add.j2 + %1 = load i32, ptr addrspace(1) %arrayidx.j2, align 4 + %arrayidx.i54.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.j2 + %arrayidx.j3 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i54.i, i64 20 + store i32 %1, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.j2 = trunc i32 %1 to i8 + %arrayidx.i70.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i44.i, i64 %add.j2 + store i8 %conv.j2, ptr addrspace(1) %arrayidx.i70.i, align 1 + %KernelTwo__arg_accOut34.sroa.0.0.copyload = load i64, ptr %KernelTwo__arg_accOut3, align 8 + %add.ptr.i.i7 = getelementptr inbounds i32, ptr addrspace(1) %KernelTwo__arg_accOut, i64 %KernelTwo__arg_accOut34.sroa.0.0.copyload + %2 = load i32, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.i.i13 = sext i8 %conv.j2 to i32 + %add.i.i14 = add nsw i32 %2, %conv.i.i13 + %arrayidx.i62.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i7, i64 %add.j2 + store i32 %add.i.i14, ptr addrspace(1) %arrayidx.i62.i, align 4 + ret void +} + +attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(none) } +attributes #2 = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: write) } +attributes #3 = { 
nounwind willreturn memory(none) }
+
+!11 = !{!"private", !"none", !"none", !"none", !"private", !"none", !"none", !"none"}
+!12 = !{i64 3, !"", !"", !"", i64 3, !"", !"", !""}
+!13 = !{i64 32, !"", !"", !"", i64 1, !"", !"", !""}
diff --git a/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp b/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp
new file mode 100644
index 0000000000000..3b302b733f5c5
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp
@@ -0,0 +1,96 @@
+// REQUIRES: fusion
+// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{run} %t.out
+
+// Test private internalization with "LocalSize" == 3 on buffers that trigger
+// special cases in the GEP analysis during remapping.
+// - `tmp`:
+//   - On SPIR-V and CUDA targets, the IR contains `i8`-typed GEPs to access the
+//     elements in the sycl::vec. These GEPs shall _not_ be remapped.
+//   - On HIP, the IR contains GEP instructions that add a pointer offset (hence
+//     must be remapped) _and_ address into the aggregate element.
+// - `tmp2` is an `i8` buffer. The corresponding `i8`-typed GEPs must be
+//   remapped during internalization.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+struct MyStruct {
+  int pad;
+  sycl::int3 v;
+};
+
+int main() {
+  constexpr int dataSize = 384;
+  std::array<int, dataSize> in, out;
+  std::array<MyStruct, dataSize> tmp;
+  std::array<int8_t, dataSize> tmp2;
+
+  for (int i = 0; i < dataSize; ++i) {
+    in[i] = i;
+    tmp[i].v.y() = -1;
+    tmp2[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn{in.data(), range{dataSize}};
+    buffer bTmp{tmp.data(), range{dataSize}};
+    buffer bTmp2{tmp2.data(), range{dataSize}};
+    buffer bOut{out.data(), range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn = bIn.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accTmp2 = bTmp2.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for<class KernelOne>(range<1>{dataSize / 3}, [=](id<1> i) {
+        accTmp[3 * i].v.y() = accIn[3 * i];
+        accTmp[3 * i + 1].v.y() = accIn[3 * i + 1];
+        accTmp[3 * i + 2].v.y() = accIn[3 * i + 2];
+        accTmp2[3 * i + 2] = static_cast<int8_t>(accIn[3 * i] ^ 0xAA);
+        accTmp2[3 * i + 1] = static_cast<int8_t>(accIn[3 * i + 1] ^ 0xAA);
+        accTmp2[3 * i] = static_cast<int8_t>(accIn[3 * i + 2] ^ 0xAA);
+      });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accTmp2 = bTmp2.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accIn = bIn.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(range<1>{dataSize / 3}, [=](id<1> i) {
+        accOut[3 * i] = accTmp[3 * i].v.y() * accIn[3 * i] + accTmp2[3 * i + 2];
+        accOut[3 * i + 1] =
+            accTmp[3 * i + 1].v.y() * accIn[3 * i + 1] + accTmp2[3 * i + 1];
+        accOut[3 * i + 2] =
+            accTmp[3 * i + 2].v.y() * accIn[3 * i + 2] + accTmp2[3 * i];
+      });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (int i = 0; i < dataSize; ++i) {
+    assert(out[i] == (i * i + static_cast<int8_t>(i ^ 0xAA)) &&
+           "Computation error");
+    assert(tmp[i].v.y() == -1 && tmp2[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}