diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index e5dcf9ce309cd8..32cb1056022de2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -77,6 +77,29 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ret void } +define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate(ptr %ptr, double %data) #0 { + ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate + ; GFX90A_GFX940: bb.0 (%ir-block.0): + ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret void +} + 
define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 { ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -104,8 +127,36 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ret double %ret } +define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr, double %data) #0 { + ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw__noprivate + ; GFX90A_GFX940: bb.0 (%ir-block.0): + ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A_GFX940-NEXT: 
$sgpr0 = COPY [[COPY6]] + ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %ret +} + declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double) attributes #0 = { nounwind } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll new file mode 100644 index 00000000000000..64bd4804ccd519 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -0,0 +1,6804 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_add_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 
v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_add_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 
+; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") 
seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_add_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_add_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: 
flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 
v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; 
GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_and_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr 
%out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_and_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: 
v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 
s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 
+; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_and_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_and_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: 
v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 
v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 
v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; 
GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_offset: +; GFX8: ; %bb.0: ; 
%entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: 
atomic_sub_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] 
offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_sub_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + 
+define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile 
sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_max_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], 
v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_max_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: 
s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 
v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_max_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
atomic_max_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_max_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_smax_x2 
v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, 
s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; 
GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE 
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umax_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { +; 
GFX7-LABEL: atomic_umax_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void 
@atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + 
ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_min_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: 
flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_min_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_min_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; 
+; GFX12-LABEL: atomic_min_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 
+; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; 
GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_addr64_offset: +; GFX8: ; 
%bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 
s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, 
i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umin_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] 
glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; 
GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in 
syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_or_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_or_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + 
%tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_or_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_or_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 
0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; 
+; GFX12-LABEL: atomic_or_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 
s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { +; GFX7-LABEL: atomic_xchg_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { +; GFX7-LABEL: atomic_xchg_pointer_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_pointer_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_pointer_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 
:: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr ptr, ptr %out, i32 4 + %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; 
GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; 
GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], 
v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm 
+; +; GFX8-LABEL: atomic_xchg_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: 
flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: 
s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + 
+define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, 
!noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xor_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_xor_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64: +; 
GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %in, i64 4 + %val = load atomic i64, ptr %gep seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: 
v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: 
flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %in, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = load atomic i64, ptr %gep seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %in, i64 %index + %val = load atomic i64, ptr %ptr seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { +; GFX7-LABEL: atomic_store_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s0, s2, 32 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_addc_u32 s1, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_offset: +; GFX8: ; %bb.0: ; 
%entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s2, 32 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + store atomic i64 %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { +; GFX7-LABEL: atomic_store_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; 
GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + store atomic i64 %in, ptr %out seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS 
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + store atomic i64 %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS 
+; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + store atomic i64 %in, ptr %ptr seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s2, s4, 32 +; GFX7-NEXT: s_addc_u32 s3, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s2, s4, 32 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV 
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_soffset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s2, s4, 0x11940 +; GFX7-NEXT: s_addc_u32 s3, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_soffset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s2, s4, 0x11940 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_soffset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; 
GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 9000 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_cmpxchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, 
s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_add_u32 s0, s4, s2 +; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: s_add_u32 s2, s0, 32 +; GFX7-NEXT: s_addc_u32 s3, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s4, s2 +; GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: s_addc_u32 s3, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; 
GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX7-NEXT: s_add_u32 s2, s4, s2 +; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX8-NEXT: s_add_u32 s2, s4, s2 +; GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 
v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], 
v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %in, i64 4 + %val = load atomic double, ptr %gep seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_f64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_f64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, 
s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %in, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 + %val = load atomic double, ptr %gep seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_f64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %in, i64 %index + %val = load atomic double, ptr %ptr seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { +; GFX7-LABEL: atomic_store_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s0, s2, 32 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: 
s_addc_u32 s1, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s2, 32 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %out, i64 4 + store atomic double %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { +; GFX7-LABEL: atomic_store_f64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + store atomic double %in, ptr %out seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_f64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %out, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 + store atomic double %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_f64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %out, i64 %index + store atomic double %in, ptr %ptr seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr 
%out, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 
v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_incr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_incr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64_offset: +; GFX12: ; 
%bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; 
GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_inc_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
atomic_inc_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_incr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_incr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_ret_incr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_incr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; 
GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_decr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd 
+; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_decr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw 
volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, 
s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_dec_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; 
GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_decr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_decr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: 
s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_ret_decr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_decr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_dec_i64_ret_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +!0 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll new file mode 100644 index 00000000000000..edd5620dc41128 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -0,0 +1,9196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s + +; --------------------------------------------------------------------- +; atomicrmw xchg +; --------------------------------------------------------------------- + +define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 
%in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; 
GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], 
v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 
= atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xchg f64 +; --------------------------------------------------------------------- + +define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define double 
@flat_atomic_xchg_f64_ret(ptr %ptr, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; 
GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: 
flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = 
getelementptr double, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret double %result +} + +; --------------------------------------------------------------------- +; atomicrmw add +; --------------------------------------------------------------------- + +define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 
@flat_atomic_add_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 
v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 
32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX8: ; 
%bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + 
+define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw sub +; --------------------------------------------------------------------- + +define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, 
ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, 
s35 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: 
+; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw and +; --------------------------------------------------------------------- + +define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX8-LABEL: flat_atomic_and_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void 
@flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; 
GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; 
GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v4 +; GFX7-NEXT: v_not_b32_e32 v4, v8 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], 
v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; 
GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_nand_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: 
v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v4 +; GFX7-NEXT: v_not_b32_e32 v4, v8 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, 
i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v0 +; GFX7-NEXT: v_not_b32_e32 v6, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX8-NEXT: v_not_b32_e32 v7, v0 +; GFX8-NEXT: v_not_b32_e32 
v6, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 
36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: 
v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, 
s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 
+; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: 
+; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; 
GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v0 +; GFX7-NEXT: v_not_b32_e32 v6, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX8-NEXT: v_not_b32_e32 v7, v0 +; GFX8-NEXT: v_not_b32_e32 v6, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw or +; --------------------------------------------------------------------- + +define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_or_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 
s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; 
GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: 
+; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_xor_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr 
inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], 
v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: 
flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw max +; --------------------------------------------------------------------- + +define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB80_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB80_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, 
v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB81_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB81_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz 
.LBB81_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB82_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; 
GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB82_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB83_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB83_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; 
GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB84_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB84_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB85_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB85_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 
%in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB86_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: 
s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB86_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: 
s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB87_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: 
s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB87_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] 
offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB88_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
atomic_max_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB88_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; 
GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB89_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB89_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: 
flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB90_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, 
v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB90_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 
v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB91_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB91_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + 
ret void +} + +define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB92_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB92_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB93_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB93_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB94_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB94_1 +; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB95_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB95_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB96_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 
+; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB96_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: 
v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB97_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB97_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, 
v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB98_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, 
s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB98_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB99_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB99_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, 
v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; 
GFX7-NEXT: s_cbranch_execnz .LBB100_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB100_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB100_1: ; 
%atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB101_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB101_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: 
flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB102_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB102_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: 
atomic_umax_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB103_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB103_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, 
i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB104_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB104_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB105_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB105_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; 
GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB106_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, 
vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB106_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umin +; --------------------------------------------------------------------- + +define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB107_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB107_1: 
; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB107_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX7: ; %bb.0: +; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB108_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol 
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB108_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB109_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB109_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], 
v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB110_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB110_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 
+; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], 
v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB111_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB111_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 
v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB112_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB112_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
flat_atomic_umin_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 
v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB113_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: 
s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB113_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; 
GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB114_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: 
v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB114_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: 
flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB115_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB115_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: 
v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB116_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB116_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw min +; --------------------------------------------------------------------- + +define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 
+; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB117_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB117_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB118_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB118_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 
vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB119_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 
s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB119_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB120_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: 
flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB120_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = 
atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB121_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; 
GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB121_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB122_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; 
GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB122_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB123_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: 
s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB123_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB123_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB124_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB124_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB124_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB125_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB125_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB125_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; 
GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB126_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB126_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB126_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in 
seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB127_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: 
v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB127_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB127_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB128_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], 
v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB128_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB128_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: 
flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB129_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, 
v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB129_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory 
!0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB130_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; 
GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB130_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw uinc_wrap +; 
--------------------------------------------------------------------- + +define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg 
%in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, 
!noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw udec_wrap +; --------------------------------------------------------------------- + +define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 
s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, 
i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: 
v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + 
%gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +!0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll new file mode 100644 index 00000000000000..3de502874d3237 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -0,0 +1,1522 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX900 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX12 %s + +; -------------------------------------------------------------------- +; Idempotent expansion cases without noalias.addrspace +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, 
!amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; -------------------------------------------------------------------- +; Idempotent expansion cases with noalias.addrspace +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: 
[[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; 
-------------------------------------------------------------------- +; General expansion for add +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for xchg +; 
-------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_xchg_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for xchg (pointer type) +; -------------------------------------------------------------------- + +define ptr addrspace(1) 
@test_flat_atomicrmw_xchg_p1_agent(ptr %ptr, ptr addrspace(1) %value) { +; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(1) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(1) %res +} + +define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(1) %value) { +; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(1) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(1) %res +} + +define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(3) %value) { +; ALL-LABEL: define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(3) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(3) [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(3) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(3) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(3) %res +} + +; 
-------------------------------------------------------------------- +; General expansion for and +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_and_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret i64 %res +} + + +define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5( +; 
ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw and ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for fadd +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; 
GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; 
GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, 
ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double 
@test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = 
load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; 
GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double 
@test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret float [[TMP5]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast 
float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label %[[ATOMICRMW_SHARED:.*]], label %[[ATOMICRMW_CHECK_PRIVATE:.*]] +; GFX90A: [[ATOMICRMW_SHARED]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_CHECK_PRIVATE]]: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = 
atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], %[[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fadd ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr %ptr, <2 x half> %value) { +; GFX7-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; 
GFX7-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret <2 x half> [[TMP5]] +; +; GFX900-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> 
@test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; 
GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret <2 x half> [[TMP5]] +; +; GFX940-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX12-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5(ptr %ptr, <2 x bfloat> %value) { +; GFX7-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX900-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], 
%[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX940-SAME: ptr 
[[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX12-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +; -------------------------------------------------------------------- +; General expansion for fmin +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: 
[[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double 
@test_flat_atomicrmw_fmin_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; 
GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue 
{ i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label 
%[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP6]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double 
@llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; 
GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret float [[RES]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; 
GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[LOADED_PHI:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] 
syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX940-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX940: [[ATOMICRMW_START]]: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX940: [[ATOMICRMW_END]]: +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace 
!1, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; -------------------------------------------------------------------- +; General expansion for fmax +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: 
[[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: 
[[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { 
i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: 
[[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], 
%[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP6]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr 
[[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to 
double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float 
[[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret float [[RES]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[LOADED_PHI:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX940-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX940: [[ATOMICRMW_START]]: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], 
%[[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX940: [[ATOMICRMW_END]]: +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; -------------------------------------------------------------------- +; General expansion for nand +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_nand_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = 
cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label 
%[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret i64 %res +} + + +define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR]], align 4 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +!0 = !{} +!1 = !{i32 5, i32 6} + +;. 
+; GFX7: [[META0]] = !{} +; GFX7: [[META1]] = !{i32 5, i32 6} +;. +; GFX900: [[META0]] = !{} +; GFX900: [[META1]] = !{i32 5, i32 6} +;. +; GFX908: [[META0]] = !{} +; GFX908: [[META1]] = !{i32 5, i32 6} +;. +; GFX90A: [[META0]] = !{} +; GFX90A: [[META1]] = !{i32 5, i32 6} +;. +; GFX940: [[META0]] = !{} +; GFX940: [[META1]] = !{i32 5, i32 6} +;. +; GFX12: [[META0]] = !{} +; GFX12: [[META1]] = !{i32 5, i32 6} +;.