Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reapply "[AMDGPU][GlobalISel] Fix load/store of pointer vectors, buffer.*.pN (#110714)" v2 #111708

Merged

Conversation

krzysz00
Copy link
Contributor

@krzysz00 krzysz00 commented Oct 9, 2024

This adds -disable-gisel-legality-check to some gfx6 and gfx7 test lines to prevent behavior mismatches between debug and release builds.

The first attempted reapply was #111059

This reverts commit e075dcf.

@llvmbot
Copy link
Collaborator

llvmbot commented Oct 9, 2024

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Krzysztof Drewniak (krzysz00)

Changes

This adds -disable-gisel-legality-check to some gfx6 and gfx7 test lines to prevent behavior mismatches between debug and release builds.

The first attempted reapply was #111059

This reverts commit e075dcf.


Patch is 338.97 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111708.diff

12 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+42-19)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h (+5-7)
  • (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+1-1)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll (+301)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir (+3289)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir (+67-4)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir (+27-24)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir (+80-72)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir (+54-44)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir (+30-20)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir (+47-36)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir (+48-36)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b35f9faf024bdb..4409a0d50e553e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -494,6 +494,8 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
     return false;
 
   const unsigned Size = Ty.getSizeInBits();
+  if (Ty.isPointerVector())
+    return true;
   if (Size <= 64)
     return false;
   // Address space 8 pointers get their own workaround.
@@ -502,9 +504,6 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
   if (!Ty.isVector())
     return true;
 
-  if (Ty.isPointerVector())
-    return true;
-
   unsigned EltSize = Ty.getScalarSizeInBits();
   return EltSize != 32 && EltSize != 64;
 }
@@ -5818,8 +5817,9 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
   return Reg;
 }
 
-Register AMDGPULegalizerInfo::fixStoreSourceType(
-  MachineIRBuilder &B, Register VData, bool IsFormat) const {
+Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
+                                                 Register VData, LLT MemTy,
+                                                 bool IsFormat) const {
   MachineRegisterInfo *MRI = B.getMRI();
   LLT Ty = MRI->getType(VData);
 
@@ -5829,6 +5829,10 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(
   if (hasBufferRsrcWorkaround(Ty))
     return castBufferRsrcToV4I32(VData, B);
 
+  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
+    Ty = getBitcastRegisterType(Ty);
+    VData = B.buildBitcast(Ty, VData).getReg(0);
+  }
   // Fixup illegal register types for i8 stores.
   if (Ty == LLT::scalar(8) || Ty == S16) {
     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
@@ -5846,22 +5850,26 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(
 }
 
 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
-                                              MachineRegisterInfo &MRI,
-                                              MachineIRBuilder &B,
+                                              LegalizerHelper &Helper,
                                               bool IsTyped,
                                               bool IsFormat) const {
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
   Register VData = MI.getOperand(1).getReg();
   LLT Ty = MRI.getType(VData);
   LLT EltTy = Ty.getScalarType();
   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
   const LLT S32 = LLT::scalar(32);
 
-  VData = fixStoreSourceType(B, VData, IsFormat);
-  castBufferRsrcArgToV4I32(MI, B, 2);
-  Register RSrc = MI.getOperand(2).getReg();
-
   MachineMemOperand *MMO = *MI.memoperands_begin();
   const int MemSize = MMO->getSize().getValue();
+  LLT MemTy = MMO->getMemoryType();
+
+  VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
+
+  castBufferRsrcArgToV4I32(MI, B, 2);
+  Register RSrc = MI.getOperand(2).getReg();
 
   unsigned ImmOffset;
 
@@ -5954,10 +5962,13 @@ static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
 }
 
 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
-                                             MachineRegisterInfo &MRI,
-                                             MachineIRBuilder &B,
+                                             LegalizerHelper &Helper,
                                              bool IsFormat,
                                              bool IsTyped) const {
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+  GISelChangeObserver &Observer = Helper.Observer;
+
   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
   MachineMemOperand *MMO = *MI.memoperands_begin();
   const LLT MemTy = MMO->getMemoryType();
@@ -6006,9 +6017,21 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
   // logic doesn't have to handle that case.
   if (hasBufferRsrcWorkaround(Ty)) {
+    Observer.changingInstr(MI);
     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
+    Observer.changedInstr(MI);
     Dst = MI.getOperand(0).getReg();
+    B.setInsertPt(B.getMBB(), MI);
   }
+  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
+    Ty = getBitcastRegisterType(Ty);
+    Observer.changingInstr(MI);
+    Helper.bitcastDst(MI, Ty, 0);
+    Observer.changedInstr(MI);
+    Dst = MI.getOperand(0).getReg();
+    B.setInsertPt(B.getMBB(), MI);
+  }
+
   LLT EltTy = Ty.getScalarType();
   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
   const bool Unpacked = ST.hasUnpackedD16VMem();
@@ -7388,17 +7411,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_raw_ptr_buffer_store:
   case Intrinsic::amdgcn_struct_buffer_store:
   case Intrinsic::amdgcn_struct_ptr_buffer_store:
-    return legalizeBufferStore(MI, MRI, B, false, false);
+    return legalizeBufferStore(MI, Helper, false, false);
   case Intrinsic::amdgcn_raw_buffer_store_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
   case Intrinsic::amdgcn_struct_buffer_store_format:
   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
-    return legalizeBufferStore(MI, MRI, B, false, true);
+    return legalizeBufferStore(MI, Helper, false, true);
   case Intrinsic::amdgcn_raw_tbuffer_store:
   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
   case Intrinsic::amdgcn_struct_tbuffer_store:
   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
-    return legalizeBufferStore(MI, MRI, B, true, true);
+    return legalizeBufferStore(MI, Helper, true, true);
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
   case Intrinsic::amdgcn_raw_atomic_buffer_load:
@@ -7407,17 +7430,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_struct_ptr_buffer_load:
   case Intrinsic::amdgcn_struct_atomic_buffer_load:
   case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
-    return legalizeBufferLoad(MI, MRI, B, false, false);
+    return legalizeBufferLoad(MI, Helper, false, false);
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
   case Intrinsic::amdgcn_struct_buffer_load_format:
   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
-    return legalizeBufferLoad(MI, MRI, B, true, false);
+    return legalizeBufferLoad(MI, Helper, true, false);
   case Intrinsic::amdgcn_raw_tbuffer_load:
   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
   case Intrinsic::amdgcn_struct_tbuffer_load:
   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
-    return legalizeBufferLoad(MI, MRI, B, true, true);
+    return legalizeBufferLoad(MI, Helper, true, true);
   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 84470dc75b60ef..86c15197805d23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -195,15 +195,13 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
 
   Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           Register Reg, bool ImageStore = false) const;
-  Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
+  Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy,
                               bool IsFormat) const;
 
-  bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
-                           MachineIRBuilder &B, bool IsTyped,
-                           bool IsFormat) const;
-  bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
-                          MachineIRBuilder &B, bool IsFormat,
-                          bool IsTyped) const;
+  bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper,
+                           bool IsTyped, bool IsFormat) const;
+  bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper,
+                          bool IsFormat, bool IsTyped) const;
   bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
                             Intrinsic::ID IID) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ef9adde13348fe..902feacede83f4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -590,7 +590,7 @@ class RegisterTypes<list<ValueType> reg_types> {
 
 def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
 def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, p1, p4, v4i16, v4f16, v4bf16]>;
 def Reg96Types : RegisterTypes<[v3i32, v3f32]>;
 def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll
new file mode 100644
index 00000000000000..091c9f143ce7ee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll
@@ -0,0 +1,301 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck --check-prefix=GFX9 %s
+
+define ptr @buffer_load_p0(ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_load_p0
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX9-NEXT:   [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY4]]
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY5]]
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  %ret = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret ptr %ret
+}
+
+define void @buffer_store_p0(ptr %data, ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_store_p0
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+  ; GFX9-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   SI_RETURN
+  call void @llvm.amdgcn.raw.ptr.buffer.store.p0(ptr %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define ptr addrspace(1) @buffer_load_p1(ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_load_p1
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX9-NEXT:   [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY4]]
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY5]]
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  %ret = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret ptr addrspace(1) %ret
+}
+
+define void @buffer_store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_store_p1
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+  ; GFX9-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   SI_RETURN
+  call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define ptr addrspace(4) @buffer_load_p4(ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_load_p4
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX9-NEXT:   [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1
+  ; GFX9-NEXT:   $vgpr0 = COPY [[COPY4]]
+  ; GFX9-NEXT:   $vgpr1 = COPY [[COPY5]]
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  %ret = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret ptr addrspace(4) %ret
+}
+
+define void @buffer_store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_store_p4
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+  ; GFX9-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   SI_RETURN
+  call void @llvm.amdgcn.raw.ptr.buffer.store.p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define ptr addrspace(5) @buffer_load_p5(ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_load_p5
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX9-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0
+  %ret = call ptr addrspace(5) @llvm.amdgcn.raw.ptr.buffer.load.p5(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret ptr addrspace(5) %ret
+}
+
+define void @buffer_store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_store_p5
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr16
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr17
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX9-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX9-NEXT:   BUFFER_STORE_DWORD_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.buf, align 1, addrspace 8)
+  ; GFX9-NEXT:   SI_RETURN
+  call void @llvm.amdgcn.raw.ptr.buffer.store.p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define <2 x ptr addrspace(1)> @buffer_load_v2p1(ptr addrspace(8) inreg %buf) {
+  ; GFX9-LABEL: name: buffer_load_v2p1
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:sreg_...
[truncated]

The instruction selection tests had illegal instruction sequences in
them, which the new tests don't generate.
@krzysz00 krzysz00 merged commit 4b4a0d4 into llvm:main Oct 9, 2024
8 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Oct 9, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building llvm at step 10 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/6832

Here is the relevant piece of the build log for the reference
Step 10 (Add check check-offload) failure: 1200 seconds without output running [b'ninja', b'-j 32', b'check-offload'], attempting to kill
...
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug53727.cpp (866 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug50022.cpp (867 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug47654.cpp (868 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/test_libc.cpp (869 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/wtime.c (870 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp (871 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp (872 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/complex_reduction.cpp (873 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp (874 of 879)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp (875 of 879)
command timed out: 1200 seconds without output running [b'ninja', b'-j 32', b'check-offload'], attempting to kill
process killed by signal 9
program finished with exit code -1
elapsedTime=1237.519794

@llvm-ci
Copy link
Collaborator

llvm-ci commented Oct 10, 2024

LLVM Buildbot has detected a new failure on builder bolt-x86_64-ubuntu-nfc running on bolt-worker while building llvm at step 8 "test-build-bolt-check-bolt".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/92/builds/7825

Here is the relevant piece of the build log for the reference
Step 8 (test-build-bolt-check-bolt) failure: test (failure)
******************** TEST 'BOLT :: perf2bolt/perf_test.test' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 5: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/clang /home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/Inputs/perf_test.c -fuse-ld=lld -Wl,--script=/home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/Inputs/perf_test.lds -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/clang /home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/Inputs/perf_test.c -fuse-ld=lld -Wl,--script=/home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/Inputs/perf_test.lds -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp
RUN: at line 6: perf record -Fmax -e cycles:u -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp2 -- /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp
+ perf record -Fmax -e cycles:u -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp2 -- /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp
info: Using a maximum frequency rate of 2000 Hz
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.002 MB /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp2 (9 samples) ]
RUN: at line 7: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/perf2bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp -p=/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp2 -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp3 -nl -ignore-build-id 2>&1 | /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/perf_test.test
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/perf_test.test
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/perf2bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp -p=/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp2 -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp3 -nl -ignore-build-id
/home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/perf_test.test:10:12: error: CHECK-NOT: excluded string found in input
CHECK-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection.
           ^
<stdin>:27:2: note: found here
 !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection. The generated data may be ineffective for improving performance.
 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Input file: <stdin>
Check file: /home/worker/bolt-worker2/llvm-project/bolt/test/perf2bolt/perf_test.test

-dump-input=help explains the following input dump.

Input was:
<<<<<<
        .
        .
        .
       22: BOLT-WARNING: Running parallel work of 0 estimated cost, will switch to trivial scheduling. 
       23: PERF2BOLT: processing basic events (without LBR)... 
       24: PERF2BOLT: read 9 samples 
       25: PERF2BOLT: out of range samples recorded in unknown regions: 9 (100.0%) 
       26:  
       27:  !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection. The generated data may be ineffective for improving performance. 
not:10      !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~                                                                   error: no match expected
       28:  
       29: PERF2BOLT: wrote 0 objects and 0 memory objects to /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolt/test/perf2bolt/Output/perf_test.test.tmp3 
       30: BOLT-INFO: 0 out of 13 functions in the binary (0.0%) have non-empty execution profile 
>>>>>>

--

********************


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants