diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 06ca84e951487..9a353e0e01cf3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10212,7 +10212,7 @@ static bool followSubRegDef(MachineInstr &MI,
 }
 
 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                                     MachineRegisterInfo &MRI) {
+                                     const MachineRegisterInfo &MRI) {
   assert(MRI.isSSA());
   if (!P.Reg.isVirtual())
     return nullptr;
@@ -10748,7 +10748,34 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
     return false;
 
-  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+  // SCC is already valid after SCCValid, and SCCRedefine would only redefine
+  // SCC to that same value. If both instructions are in the same block and
+  // nothing in between modifies SCC, erase SCCRedefine and update the
+  // kill/dead flags so SCC stays live across the gap. Returns false, leaving
+  // the code untouched, when the redundant definition cannot be removed.
+  const auto optimizeSCC = [this](MachineInstr *SCCValid,
+                                  MachineInstr *SCCRedefine) -> bool {
+    // The scan below walks one block's instruction list from SCCValid onward.
+    if (SCCValid->getParent() != SCCRedefine->getParent())
+      return false;
+    MachineInstr *KillsSCC = nullptr;
+    for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
+                                       SCCRedefine->getIterator())) {
+      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+        return false;
+      if (MI.killsRegister(AMDGPU::SCC, &RI))
+        KillsSCC = &MI;
+    }
+    if (MachineOperand *SccDef =
+            SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+      SccDef->setIsDead(false);
+    if (KillsSCC)
+      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+    SCCRedefine->eraseFromParent();
+    return true;
+  };
+
+  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, optimizeSCC,
                                   this]() -> bool {
     if (CmpValue != 0)
       return false;
@@ -10783,25 +10810,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
       return false;
 
-    MachineInstr *KillsSCC = nullptr;
-    for (MachineInstr &MI :
-         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
-      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
-        return false;
-      if (MI.killsRegister(AMDGPU::SCC, &RI))
-        KillsSCC = &MI;
-    }
+    if (!optimizeSCC(Def, &CmpInstr))
+      return false;
 
-    if (MachineOperand *SccDef =
-            Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
-      SccDef->setIsDead(false);
-    if (KillsSCC)
-      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
-    CmpInstr.eraseFromParent();
+    // If s_or_b32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of
+    // a register pair) and the input is a 64-bit foldableSelect then transform:
+    //
+    //   (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0)
+    //     => (S_CSELECT_B64 (non-zero imm), 0)
+    if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+        MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+      const MachineOperand &OrOpnd1 = Def->getOperand(1);
+      const MachineOperand &OrOpnd2 = Def->getOperand(2);
+
+      if (OrOpnd1.isReg() && OrOpnd2.isReg() &&
+          OrOpnd1.getReg() != OrOpnd2.getReg()) {
+        auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI);
+        auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI);
+        // getVRegSubRegDef returns nullptr for non-virtual registers.
+        if (Def1 && Def1 == Def2 && foldableSelect(Def1))
+          optimizeSCC(Def1, Def);
+      }
+    }
     return true;
   };
 
-  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
+  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, optimizeSCC,
                                this](int64_t ExpectedValue, unsigned SrcSize,
                                      bool IsReversible, bool IsSigned) -> bool {
     // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
@@ -10875,21 +10909,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
       return false;
 
-    MachineInstr *KillsSCC = nullptr;
-    for (MachineInstr &MI :
-         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
-      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
-        return false;
-      if (MI.killsRegister(AMDGPU::SCC, &RI))
-        KillsSCC = &MI;
-    }
-
-    MachineOperand *SccDef =
-        Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
-    SccDef->setIsDead(false);
-    if (KillsSCC)
-      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
-    CmpInstr.eraseFromParent();
+    if (!optimizeSCC(Def, &CmpInstr))
+      return false;
 
     if (!MRI->use_nodbg_empty(DefReg)) {
       assert(!IsReversedCC);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 145ce9eca7f45..9095f98ac617e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1704,7 +1704,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
 /// skipping copy like instructions and subreg-manipulation pseudos.
 /// Following another subreg of a reg:subreg isn't supported.
 MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                               MachineRegisterInfo &MRI);
+                               const MachineRegisterInfo &MRI);
 
 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
 /// DefMI and the use at \p UseMI. Should be run on SSA.
Currently does not diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 51df8c34cc55e..54b1554ae5d04 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7772,7 +7772,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31 @@ -7782,8 +7781,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX6-NEXT: s_sub_u32 s12, 0, s10 -; GFX6-NEXT: s_subb_u32 s13, 0, s11 +; GFX6-NEXT: s_sub_u32 s0, 0, s10 +; GFX6-NEXT: s_subb_u32 s1, 0, s11 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -7792,128 +7791,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s0 -; GFX6-NEXT: s_mul_i32 s16, s12, s0 -; GFX6-NEXT: s_add_i32 s1, s17, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s1, s1, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s17, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; 
GFX6-NEXT: s_add_u32 s15, s15, s17 -; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s17 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s18, v4 -; GFX6-NEXT: s_add_u32 s15, s15, s16 -; GFX6-NEXT: s_addc_u32 s15, s17, s18 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_add_u32 s1, s15, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s0, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 -; GFX6-NEXT: s_add_i32 s0, s0, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s0 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s1, s15, s12 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; GFX6-NEXT: s_mul_i32 s0, s14, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s15, s16, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s12 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s13, s0, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: 
s_mul_i32 s14, s1, s2 +; GFX6-NEXT: s_mul_i32 s15, s0, s2 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s15 +; GFX6-NEXT: s_add_i32 s13, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s15 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_mul_i32 s16, s2, s13 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s14, s14, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s15, s12, s15 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s17, v4 +; GFX6-NEXT: s_add_u32 s14, s14, s15 +; GFX6-NEXT: s_addc_u32 s14, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 +; GFX6-NEXT: s_addc_u32 s15, s15, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s13, s14, s13 +; GFX6-NEXT: s_addc_u32 s14, 0, s15 +; GFX6-NEXT: s_add_u32 s13, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s14 +; GFX6-NEXT: s_mul_i32 s14, s0, s12 +; GFX6-NEXT: s_mul_i32 s1, s1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s0, s0, s13 +; GFX6-NEXT: s_add_i32 s1, s14, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s1 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s0, s12, s0 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s0, s15, s0 +; GFX6-NEXT: s_addc_u32 s0, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s14 +; GFX6-NEXT: s_add_u32 s14, 
s13, s0 +; GFX6-NEXT: s_addc_u32 s15, s12, s1 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: s_addc_u32 s1, s7, s12 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s14 +; GFX6-NEXT: s_mul_i32 s1, s6, s15 ; GFX6-NEXT: v_readfirstlane_b32 s16, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s16, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s15, s7, s15 +; GFX6-NEXT: s_mul_i32 s14, s7, s14 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s15 +; GFX6-NEXT: s_add_u32 s1, s1, s14 ; GFX6-NEXT: s_addc_u32 s1, s4, s16 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s16, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_mul_i32 s14, s7, s15 +; GFX6-NEXT: s_add_u32 s14, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s4 +; GFX6-NEXT: s_addc_u32 s15, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s17 +; GFX6-NEXT: s_mul_i32 s4, s10, s15 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s16 -; GFX6-NEXT: s_add_i32 s18, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s18 -; GFX6-NEXT: s_mul_i32 s4, s10, s16 +; GFX6-NEXT: s_mul_i32 s5, s11, s14 +; GFX6-NEXT: s_add_i32 s16, s4, s5 +; GFX6-NEXT: s_sub_i32 s17, s7, s16 +; GFX6-NEXT: s_mul_i32 s4, s10, s14 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s4, s5 -; GFX6-NEXT: 
s_subb_u32 s19, s14, s11 -; GFX6-NEXT: s_sub_u32 s20, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, s11 +; GFX6-NEXT: s_sub_u32 s18, s6, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s11 ; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s20, s16, 2 -; GFX6-NEXT: s_addc_u32 s21, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s20, s15 -; GFX6-NEXT: s_cselect_b32 s15, s21, s19 +; GFX6-NEXT: s_cmp_ge_u32 s18, s10 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s11 +; GFX6-NEXT: s_cselect_b32 s17, s18, s19 +; GFX6-NEXT: s_add_u32 s18, s14, 1 +; GFX6-NEXT: s_addc_u32 s19, s15, 0 +; GFX6-NEXT: s_add_u32 s20, s14, 2 +; GFX6-NEXT: s_addc_u32 s21, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, s20, s18 +; GFX6-NEXT: s_cselect_b32 s18, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s18 +; GFX6-NEXT: s_subb_u32 s4, s7, s16 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 @@ -7921,13 +7913,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s4, s6, s5 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s5, s15, s17 -; GFX6-NEXT: s_cselect_b32 s4, s14, s16 +; GFX6-NEXT: s_cselect_b32 s5, s18, s15 +; GFX6-NEXT: s_cselect_b32 s4, s17, s14 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 ; GFX6-NEXT: s_subb_u32 s5, s5, s7 ; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -8278,8 +8271,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s14, 0, s6 -; GFX6-NEXT: s_subb_u32 s15, 0, s7 +; GFX6-NEXT: s_sub_u32 s12, 0, s6 +; GFX6-NEXT: s_subb_u32 s13, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8288,69 +8281,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s13, s14, s16 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_mul_i32 s17, s15, s12 -; GFX6-NEXT: s_mul_i32 s18, s14, s12 -; GFX6-NEXT: s_add_i32 s13, s19, s13 +; GFX6-NEXT: s_mul_i32 s17, s13, s15 +; GFX6-NEXT: s_mul_i32 s18, s12, s15 +; GFX6-NEXT: s_add_i32 s16, s19, s16 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s18 -; GFX6-NEXT: s_add_i32 s13, s13, s17 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: s_add_i32 s16, s16, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v3 -; GFX6-NEXT: s_mul_i32 s20, s12, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_mul_i32 s20, s15, s16 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s16 ; GFX6-NEXT: s_add_u32 s17, s17, s20 ; GFX6-NEXT: v_readfirstlane_b32 s20, v0 -; GFX6-NEXT: 
s_mul_i32 s18, s16, s18 +; GFX6-NEXT: s_mul_i32 s18, s14, s18 ; GFX6-NEXT: s_addc_u32 s20, 0, s20 ; GFX6-NEXT: v_readfirstlane_b32 s19, v4 ; GFX6-NEXT: s_add_u32 s17, s17, s18 ; GFX6-NEXT: s_addc_u32 s17, s20, s19 ; GFX6-NEXT: v_readfirstlane_b32 s18, v1 ; GFX6-NEXT: s_addc_u32 s18, s18, 0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_add_u32 s16, s17, s16 ; GFX6-NEXT: s_addc_u32 s17, 0, s18 -; GFX6-NEXT: s_add_u32 s18, s12, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s16, s16, s17 -; GFX6-NEXT: s_mul_i32 s12, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s15, s15, s18 -; GFX6-NEXT: s_mul_i32 s13, s14, s18 -; GFX6-NEXT: s_add_i32 s12, s12, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mul_hi_u32 v3, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s18, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s18, v0 -; GFX6-NEXT: s_mul_i32 s15, s18, s12 -; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_add_u32 s15, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s15, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_addc_u32 s14, s14, s17 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 +; GFX6-NEXT: s_mul_i32 s13, s13, s15 ; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_addc_u32 s17, 0, s17 -; GFX6-NEXT: v_readfirstlane_b32 s14, v3 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: s_addc_u32 s13, s17, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s12, s16, s12 -; GFX6-NEXT: s_add_u32 s12, s13, s12 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_add_u32 s15, s18, s12 -; GFX6-NEXT: 
s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s14, s16, s14 +; GFX6-NEXT: s_add_i32 s16, s17, s16 +; GFX6-NEXT: s_mul_i32 s12, s12, s15 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX6-NEXT: s_mul_i32 s17, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s19, v2 +; GFX6-NEXT: s_add_u32 s17, s19, s17 +; GFX6-NEXT: v_readfirstlane_b32 s18, v0 +; GFX6-NEXT: s_mul_i32 s12, s14, s12 +; GFX6-NEXT: s_addc_u32 s18, 0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v3 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: s_addc_u32 s12, s18, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s14, s13 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s15, s15, s12 +; GFX6-NEXT: s_addc_u32 s14, s14, s13 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 @@ -8374,40 +8363,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s18, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s17, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s19 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s18 -; GFX6-NEXT: s_add_i32 s20, s14, s15 -; GFX6-NEXT: s_sub_i32 s16, s9, s20 -; GFX6-NEXT: s_mul_i32 s14, s6, s18 +; GFX6-NEXT: s_mul_i32 s15, s7, s17 +; 
GFX6-NEXT: s_add_i32 s18, s14, s15 +; GFX6-NEXT: s_sub_i32 s19, s9, s18 +; GFX6-NEXT: s_mul_i32 s14, s6, s17 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s14, s15 -; GFX6-NEXT: s_subb_u32 s21, s16, s7 -; GFX6-NEXT: s_sub_u32 s22, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s16, s17 -; GFX6-NEXT: s_subb_u32 s16, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, s7 +; GFX6-NEXT: s_sub_u32 s20, s8, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s16, s21, s17 -; GFX6-NEXT: s_add_u32 s17, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s16, s22, s17 -; GFX6-NEXT: s_cselect_b32 s17, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s6 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s19, s20, s21 +; GFX6-NEXT: s_add_u32 s20, s17, 1 +; GFX6-NEXT: s_addc_u32 s21, s16, 0 +; GFX6-NEXT: s_add_u32 s22, s17, 2 +; GFX6-NEXT: s_addc_u32 s23, s16, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s9, s9, s20 +; GFX6-NEXT: s_subb_u32 s9, s9, s18 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 @@ -8415,12 +8401,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s17, s19 -; GFX6-NEXT: s_cselect_b32 s6, s16, s18 +; GFX6-NEXT: s_cselect_b32 
s7, s20, s16 +; GFX6-NEXT: s_cselect_b32 s6, s19, s17 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s16, s6, s2 -; GFX6-NEXT: s_subb_u32 s17, s7, s3 +; GFX6-NEXT: s_sub_u32 s14, s6, s2 +; GFX6-NEXT: s_subb_u32 s15, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8428,8 +8414,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s12, 0, s8 -; GFX6-NEXT: s_subb_u32 s13, 0, s9 +; GFX6-NEXT: s_sub_u32 s2, 0, s8 +; GFX6-NEXT: s_subb_u32 s3, 0, s9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8438,128 +8424,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s13, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s12, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s13, s2, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: s_add_i32 s13, s13, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, 
s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s16, s0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s18, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s16, s18, s16 +; GFX6-NEXT: s_addc_u32 s17, 0, s17 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 -; GFX6-NEXT: s_addc_u32 s4, s5, s18 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 -; GFX6-NEXT: s_mul_i32 s2, s12, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s13, s13, s5 -; GFX6-NEXT: s_mul_i32 s3, s12, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s16, s1 +; GFX6-NEXT: s_addc_u32 s1, s17, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s16, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s12, s13 +; GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s5, s12, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s16 +; 
GFX6-NEXT: s_mul_i32 s2, s2, s16 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 +; GFX6-NEXT: s_mul_i32 s12, s16, s3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s12, s4, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s12, s2 +; GFX6-NEXT: s_addc_u32 s2, s13, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s16, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: 
v_mul_hi_u32 v3, s10, v2 -; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s2, s10, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s15, s2 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v1 -; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s14, s15 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_add_u32 s2, s17, s2 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s18, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: s_add_u32 s2, s2, s12 +; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s13, s11, s13 +; GFX6-NEXT: s_add_u32 s16, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s19 +; GFX6-NEXT: s_addc_u32 s17, 0, s12 +; GFX6-NEXT: s_mul_i32 s12, s8, s17 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s18 -; GFX6-NEXT: s_add_i32 s20, s12, s13 -; GFX6-NEXT: s_sub_i32 s14, s11, s20 -; GFX6-NEXT: s_mul_i32 s12, s8, s18 +; GFX6-NEXT: s_mul_i32 s13, s9, s16 +; GFX6-NEXT: s_add_i32 s18, s12, s13 +; GFX6-NEXT: s_sub_i32 s19, s11, s18 +; GFX6-NEXT: s_mul_i32 s12, s8, s16 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s21, s14, s9 -; GFX6-NEXT: s_sub_u32 s22, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, 
s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, s9 +; GFX6-NEXT: s_sub_u32 s20, s10, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s9 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_add_u32 s15, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s22, s15 -; GFX6-NEXT: s_cselect_b32 s15, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s8 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s9 +; GFX6-NEXT: s_cselect_b32 s19, s20, s21 +; GFX6-NEXT: s_add_u32 s20, s16, 1 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_add_u32 s22, s16, 2 +; GFX6-NEXT: s_addc_u32 s23, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s11, s11, s20 +; GFX6-NEXT: s_subb_u32 s11, s11, s18 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 @@ -8567,15 +8546,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s15, s19 -; GFX6-NEXT: s_cselect_b32 s8, s14, s18 +; GFX6-NEXT: s_cselect_b32 s9, s20, s17 +; GFX6-NEXT: s_cselect_b32 s8, s19, s16 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; 
GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9015,105 +8994,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s10, 0, s8 -; GFX6-NEXT: s_subb_u32 s11, 0, s9 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s10, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_mul_i32 s13, s11, s0 -; GFX6-NEXT: s_mul_i32 s14, s10, s0 -; GFX6-NEXT: s_add_i32 s1, s15, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s14 -; GFX6-NEXT: s_add_i32 s1, s1, s13 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s14 -; GFX6-NEXT: v_readfirstlane_b32 s13, v3 -; GFX6-NEXT: s_mul_i32 s15, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; GFX6-NEXT: s_add_u32 s13, s13, s15 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: s_mul_i32 s14, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s16, v4 -; GFX6-NEXT: s_add_u32 s13, s13, s14 -; GFX6-NEXT: s_addc_u32 s13, s15, s16 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s14 -; GFX6-NEXT: s_add_u32 s14, s0, s1 -; GFX6-NEXT: 
v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s13 -; GFX6-NEXT: s_mul_i32 s0, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s11, s11, s14 -; GFX6-NEXT: s_mul_i32 s1, s10, s14 -; GFX6-NEXT: s_add_i32 s0, s0, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_mul_i32 s11, s14, s0 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s11, s15, s11 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s13 -; GFX6-NEXT: v_readfirstlane_b32 s10, v3 -; GFX6-NEXT: s_add_u32 s1, s11, s1 -; GFX6-NEXT: s_addc_u32 s1, s13, s10 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 -; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s0, s12, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_add_u32 s13, s14, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s10 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s11, s0, s10 +; GFX6-NEXT: v_readfirstlane_b32 s14, v2 +; GFX6-NEXT: s_mul_i32 s12, s1, s2 +; GFX6-NEXT: s_mul_i32 s13, s0, s2 +; GFX6-NEXT: s_add_i32 s11, s14, s11 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s13 +; GFX6-NEXT: s_add_i32 s11, s11, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_mul_i32 s14, s2, s11 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s11 +; GFX6-NEXT: s_add_u32 s12, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s13, s10, s13 +; GFX6-NEXT: s_addc_u32 
s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s15, v4 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s12, s14, s15 +; GFX6-NEXT: v_readfirstlane_b32 s13, v1 +; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_mul_i32 s11, s10, s11 +; GFX6-NEXT: s_add_u32 s11, s12, s11 +; GFX6-NEXT: s_addc_u32 s12, 0, s13 +; GFX6-NEXT: s_add_u32 s11, s2, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s10, s10, s12 +; GFX6-NEXT: s_mul_i32 s12, s0, s10 +; GFX6-NEXT: s_mul_i32 s1, s1, s11 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_add_i32 s12, s13, s12 +; GFX6-NEXT: s_mul_i32 s0, s0, s11 +; GFX6-NEXT: s_add_i32 s1, s12, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: s_mul_i32 s13, s11, s1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s0, s10, s0 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_add_u32 s0, s13, s0 +; GFX6-NEXT: s_addc_u32 s0, s14, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s1, s10, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s11, s0 +; GFX6-NEXT: s_addc_u32 s13, s10, s1 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 ; GFX6-NEXT: s_mov_b32 s11, s10 ; GFX6-NEXT: s_addc_u32 s1, s7, s10 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: 
v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s12 +; GFX6-NEXT: s_mul_i32 s1, s6, s13 ; GFX6-NEXT: v_readfirstlane_b32 s14, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s14, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s13, s7, s13 +; GFX6-NEXT: s_mul_i32 s12, s7, s12 ; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s12 ; GFX6-NEXT: s_addc_u32 s1, s4, s14 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s12, s7, s12 +; GFX6-NEXT: s_mul_i32 s12, s7, s13 ; GFX6-NEXT: s_add_u32 s12, s1, s12 ; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 @@ -9128,11 +9102,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s4, s5 ; GFX6-NEXT: s_subb_u32 s15, s13, s9 ; GFX6-NEXT: s_sub_u32 s16, s6, s8 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s15, 0 ; GFX6-NEXT: s_cmp_ge_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, -1, 0 @@ -9141,13 +9113,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, s19, s18 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s15, s15, s9 -; GFX6-NEXT: s_sub_u32 s19, s16, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_subb_u32 s12, s15, s9 +; GFX6-NEXT: s_sub_u32 s13, s16, s8 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s13, s13, s16 ; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: 
s_subb_u32 s4, s7, s14 @@ -9164,6 +9134,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -9405,8 +9376,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s12, 0, s2 -; GFX6-NEXT: s_subb_u32 s13, 0, s3 +; GFX6-NEXT: s_sub_u32 s6, 0, s2 +; GFX6-NEXT: s_subb_u32 s7, 0, s3 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9415,69 +9386,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s7, s12, s14 +; GFX6-NEXT: v_mul_hi_u32 v2, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 ; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s6 -; GFX6-NEXT: s_mul_i32 s16, s12, s6 -; GFX6-NEXT: s_add_i32 s7, s17, s7 +; GFX6-NEXT: s_mul_i32 s15, s7, s13 +; GFX6-NEXT: s_mul_i32 s16, s6, s13 +; GFX6-NEXT: s_add_i32 s14, s17, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s7, s7, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s7 +; GFX6-NEXT: s_add_i32 s14, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s18, s6, s7 -; GFX6-NEXT: 
v_mul_hi_u32 v1, v1, s7 +; GFX6-NEXT: s_mul_i32 s18, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s14 ; GFX6-NEXT: s_add_u32 s15, s15, s18 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_mul_i32 s16, s12, s16 ; GFX6-NEXT: s_addc_u32 s18, 0, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v4 ; GFX6-NEXT: s_add_u32 s15, s15, s16 ; GFX6-NEXT: s_addc_u32 s15, s18, s17 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_add_u32 s7, s15, s7 +; GFX6-NEXT: s_mul_i32 s14, s12, s14 +; GFX6-NEXT: s_add_u32 s14, s15, s14 ; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s6, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s6, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_add_i32 s6, s7, s6 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s7, s12, s16 -; GFX6-NEXT: s_add_i32 s6, s6, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s6 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_add_u32 s13, s13, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s15 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 +; GFX6-NEXT: s_mul_i32 s7, s7, s13 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s7, s13, s7 -; GFX6-NEXT: s_addc_u32 s7, s15, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; 
GFX6-NEXT: s_mul_i32 s6, s14, s6 -; GFX6-NEXT: s_add_u32 s6, s7, s6 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s16, s6 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s12, s14, s12 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s6, s6, s13 +; GFX6-NEXT: s_add_i32 s7, s14, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s7 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s6, s12, s6 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s6, s15, s6 +; GFX6-NEXT: s_addc_u32 s6, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s7, s12, s7 +; GFX6-NEXT: s_add_u32 s6, s6, s7 +; GFX6-NEXT: s_addc_u32 s7, 0, s14 +; GFX6-NEXT: s_add_u32 s13, s13, s6 +; GFX6-NEXT: s_addc_u32 s12, s12, s7 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -9514,11 +9481,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s14, s3 ; GFX6-NEXT: s_sub_u32 s18, s8, s2 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s14, s15 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9527,13 +9492,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; 
GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s17, s17, s3 -; GFX6-NEXT: s_sub_u32 s21, s18, s2 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_subb_u32 s14, s17, s3 +; GFX6-NEXT: s_sub_u32 s15, s18, s2 +; GFX6-NEXT: s_subb_u32 s14, s14, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s15, s15, s18 ; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 @@ -9556,8 +9519,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s8, 0, s6 -; GFX6-NEXT: s_subb_u32 s9, 0, s7 +; GFX6-NEXT: s_sub_u32 s2, 0, s6 +; GFX6-NEXT: s_subb_u32 s3, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9566,70 +9529,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s12 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s9, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s8, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s9, s2, s8 +; GFX6-NEXT: 
v_readfirstlane_b32 s12, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s9, s12, s9 +; GFX6-NEXT: s_add_i32 s9, s9, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s9 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s12, s0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s16, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s9 +; GFX6-NEXT: s_add_u32 s12, s16, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: s_mul_i32 s1, s8, s1 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s13 -; GFX6-NEXT: s_addc_u32 s4, s5, s16 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s12, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s12, s4 -; GFX6-NEXT: s_mul_i32 s2, s8, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s9, s9, s5 -; GFX6-NEXT: s_mul_i32 s3, s8, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s12, s1 +; GFX6-NEXT: s_addc_u32 s1, s13, s16 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s9, s8, s9 +; GFX6-NEXT: s_add_u32 s1, s1, s9 +; GFX6-NEXT: s_addc_u32 s9, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s8, s9 +; 
GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: s_add_i32 s5, s8, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s12 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s9, s5, s2 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_mul_i32 s8, s12, s3 ; GFX6-NEXT: v_readfirstlane_b32 s13, v2 -; GFX6-NEXT: s_add_u32 s9, s13, s9 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: v_readfirstlane_b32 s8, v3 -; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s12, s8 -; GFX6-NEXT: v_readfirstlane_b32 s8, v1 -; GFX6-NEXT: s_addc_u32 s8, s8, 0 +; GFX6-NEXT: s_add_u32 s8, s13, s8 +; GFX6-NEXT: v_readfirstlane_b32 s9, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s12, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s13, s4, s8 +; GFX6-NEXT: s_addc_u32 s9, 0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s8, s2 +; GFX6-NEXT: s_addc_u32 s2, s9, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s12, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 @@ -9667,11 +9626,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 
s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s13, s10, s11 ; GFX6-NEXT: s_subb_u32 s17, s12, s7 ; GFX6-NEXT: s_sub_u32 s18, s8, s6 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s12, s13 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9680,13 +9637,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s17, s7 -; GFX6-NEXT: s_sub_u32 s21, s18, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_subb_u32 s12, s17, s7 +; GFX6-NEXT: s_sub_u32 s13, s18, s6 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s13, s13, s18 ; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 2904bdbbdda3d..738a860c7bca0 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_add_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_sub_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: 
s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index dbdea8e3c533d..71af21a11c2ce 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s7, s6, s6 -; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -88,15 +86,13 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s2, s2 -; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_addc_u32 s0, s2, 0 +; GFX7-NEXT: s_add_u32 s1, s0, s0 +; GFX7-NEXT: s_addc_u32 s0, s0, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 71f5a94a7f245..74a6d7fe39362 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: 
s_ashr_i32 s8, s1, 31 ; GCN-NEXT: s_add_u32 s0, s0, s8 @@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GCN-NEXT: s_sub_u32 s12, 0, s10 -; GCN-NEXT: s_subb_u32 s13, 0, s11 +; GCN-NEXT: s_sub_u32 s0, 0, s10 +; GCN-NEXT: s_subb_u32 s1, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_mul_i32 s15, s13, s0 -; GCN-NEXT: s_mul_i32 s16, s12, s0 -; GCN-NEXT: s_add_i32 s1, s17, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s16 -; GCN-NEXT: s_add_i32 s1, s1, s15 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s16 -; GCN-NEXT: v_readfirstlane_b32 s15, v3 -; GCN-NEXT: s_mul_i32 s17, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s15, s15, s17 -; GCN-NEXT: v_readfirstlane_b32 s17, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s17 -; GCN-NEXT: s_mul_i32 s16, s14, s16 -; GCN-NEXT: v_readfirstlane_b32 s18, v4 -; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_addc_u32 s15, s17, s18 -; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_addc_u32 s16, s16, 0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_add_u32 s1, s15, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s16 -; GCN-NEXT: s_add_u32 s16, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s15 
-; GCN-NEXT: s_mul_i32 s0, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s1, s12, s16 -; GCN-NEXT: s_add_i32 s0, s0, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s16, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s14, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s16, v0 -; GCN-NEXT: s_mul_i32 s13, s16, s0 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_add_u32 s13, s17, s13 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s1, s15, s12 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s0, s14, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s15, s16, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s13, s0, s12 +; GCN-NEXT: v_readfirstlane_b32 s16, v2 +; GCN-NEXT: s_mul_i32 s14, s1, s2 +; GCN-NEXT: s_mul_i32 s15, s0, s2 +; GCN-NEXT: s_add_i32 s13, s16, s13 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s15 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s13 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s15 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_mul_i32 s16, s2, s13 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s13 +; GCN-NEXT: s_add_u32 s14, s14, s16 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s15, s12, s15 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s17, v4 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s14, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 
s13, s12, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 +; GCN-NEXT: s_addc_u32 s14, 0, s15 +; GCN-NEXT: s_add_u32 s13, s2, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s14, s0, s12 +; GCN-NEXT: s_mul_i32 s1, s1, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s0, s0, s13 +; GCN-NEXT: s_add_i32 s1, s14, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mul_i32 s15, s13, s1 +; GCN-NEXT: v_readfirstlane_b32 s17, v2 +; GCN-NEXT: s_add_u32 s15, s17, s15 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s0, s12, s0 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_add_u32 s0, s15, s0 +; GCN-NEXT: s_addc_u32 s0, s16, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s14 +; GCN-NEXT: s_add_u32 s14, s13, s0 +; GCN-NEXT: s_addc_u32 s15, s12, s1 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 ; GCN-NEXT: s_mov_b32 s13, s12 ; GCN-NEXT: s_addc_u32 s1, s7, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 -; GCN-NEXT: s_mul_i32 s1, s6, s14 +; GCN-NEXT: s_mul_i32 s1, s6, s15 ; GCN-NEXT: v_readfirstlane_b32 s16, v3 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 ; GCN-NEXT: s_add_u32 s1, s16, s1 ; GCN-NEXT: s_addc_u32 s4, 0, s4 -; GCN-NEXT: s_mul_i32 
s15, s7, s15 +; GCN-NEXT: s_mul_i32 s14, s7, s14 ; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_add_u32 s1, s1, s15 +; GCN-NEXT: s_add_u32 s1, s1, s14 ; GCN-NEXT: s_addc_u32 s1, s4, s16 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 -; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s16, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: s_mul_i32 s14, s7, s15 +; GCN-NEXT: s_add_u32 s14, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s4 +; GCN-NEXT: s_addc_u32 s15, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s17 +; GCN-NEXT: s_mul_i32 s4, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s16 -; GCN-NEXT: s_add_i32 s18, s4, s5 -; GCN-NEXT: s_sub_i32 s14, s7, s18 -; GCN-NEXT: s_mul_i32 s4, s10, s16 +; GCN-NEXT: s_mul_i32 s5, s11, s14 +; GCN-NEXT: s_add_i32 s16, s4, s5 +; GCN-NEXT: s_sub_i32 s17, s7, s16 +; GCN-NEXT: s_mul_i32 s4, s10, s14 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s15, s4, s5 -; GCN-NEXT: s_subb_u32 s19, s14, s11 -; GCN-NEXT: s_sub_u32 s20, s6, s10 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_or_b32 s14, s14, s15 -; GCN-NEXT: s_subb_u32 s14, s19, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_subb_u32 s17, s17, s11 +; GCN-NEXT: s_sub_u32 s18, s6, s10 +; GCN-NEXT: s_subb_u32 s17, s17, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s11 ; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s14, s19, s15 -; GCN-NEXT: s_add_u32 s15, s16, 1 -; GCN-NEXT: s_addc_u32 s19, s17, 0 -; GCN-NEXT: s_add_u32 s20, s16, 2 -; GCN-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s14, s20, s15 -; GCN-NEXT: s_cselect_b32 s15, s21, s19 +; GCN-NEXT: s_cmp_ge_u32 s18, s10 
+; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s11 +; GCN-NEXT: s_cselect_b32 s17, s18, s19 +; GCN-NEXT: s_add_u32 s18, s14, 1 +; GCN-NEXT: s_addc_u32 s19, s15, 0 +; GCN-NEXT: s_add_u32 s20, s14, 2 +; GCN-NEXT: s_addc_u32 s21, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s17, s20, s18 +; GCN-NEXT: s_cselect_b32 s18, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s18 +; GCN-NEXT: s_subb_u32 s4, s7, s16 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 @@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s4, s6, s5 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s15, s17 -; GCN-NEXT: s_cselect_b32 s4, s14, s16 +; GCN-NEXT: s_cselect_b32 s5, s18, s15 +; GCN-NEXT: s_cselect_b32 s4, s17, s14 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 ; GCN-NEXT: s_subb_u32 s5, s5, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; 
GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s2, 0, s6 -; GCN-NEXT: s_subb_u32 s10, 0, s7 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s8, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_mul_i32 s12, s10, s8 -; GCN-NEXT: s_mul_i32 s13, s2, s8 -; GCN-NEXT: s_add_i32 s9, s14, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 -; GCN-NEXT: s_add_i32 s9, s9, s12 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 -; GCN-NEXT: s_add_u32 s12, s12, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s14, v4 -; GCN-NEXT: s_add_u32 s12, s12, s13 -; GCN-NEXT: s_addc_u32 s12, s15, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s9, s11, s9 -; GCN-NEXT: s_add_u32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s12, 0, s13 -; GCN-NEXT: s_add_u32 s13, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s10, s2, s9 +; GCN-NEXT: v_readfirstlane_b32 s13, v2 +; GCN-NEXT: s_mul_i32 
s11, s8, s3 +; GCN-NEXT: s_mul_i32 s12, s2, s3 +; GCN-NEXT: s_add_i32 s10, s13, s10 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 +; GCN-NEXT: s_add_i32 s10, s10, s11 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s10 +; GCN-NEXT: s_mul_i32 s14, s3, s10 +; GCN-NEXT: s_add_u32 s11, s11, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s12, s9, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v4 +; GCN-NEXT: s_add_u32 s11, s11, s12 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s11, s14, s13 +; GCN-NEXT: s_addc_u32 s12, s15, 0 +; GCN-NEXT: s_mul_i32 s10, s9, s10 +; GCN-NEXT: s_add_u32 s10, s11, s10 +; GCN-NEXT: s_addc_u32 s11, 0, s12 +; GCN-NEXT: s_add_u32 s10, s3, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s11, s11, s12 -; GCN-NEXT: s_mul_i32 s8, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s10, s10, s13 -; GCN-NEXT: s_mul_i32 s2, s2, s13 -; GCN-NEXT: s_add_i32 s8, s8, s10 +; GCN-NEXT: s_addc_u32 s9, s9, s11 +; GCN-NEXT: s_mul_i32 s11, s2, s9 +; GCN-NEXT: s_mul_i32 s8, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_add_i32 s11, s12, s11 +; GCN-NEXT: s_mul_i32 s2, s2, s10 +; GCN-NEXT: s_add_i32 s8, s11, s8 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 -; GCN-NEXT: s_mul_i32 s10, s13, s8 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_mul_i32 s12, s10, s8 ; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_add_u32 s10, 
s14, s10 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s2, s11, s2 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NEXT: s_add_u32 s2, s10, s2 -; GCN-NEXT: s_addc_u32 s2, s12, s9 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mul_i32 s8, s11, s8 +; GCN-NEXT: s_add_u32 s12, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s2, s9, s2 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: s_add_u32 s2, s12, s2 +; GCN-NEXT: s_addc_u32 s2, s13, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s8, s9, s8 ; GCN-NEXT: s_add_u32 s2, s2, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: s_add_u32 s2, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s8, s11, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s11 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s8, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 ; GCN-NEXT: s_mul_i32 s8, s8, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s12, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s12 +; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s13, s9, s8 -; GCN-NEXT: s_sub_i32 s10, 0, s13 -; GCN-NEXT: s_mul_i32 s8, s6, s12 -; GCN-NEXT: s_sub_u32 s14, 24, s8 +; GCN-NEXT: s_add_i32 s11, s9, s8 +; GCN-NEXT: s_sub_i32 s12, 0, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s10 +; GCN-NEXT: s_sub_u32 s13, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s11, s8, s9 -; GCN-NEXT: 
s_subb_u32 s15, s10, s7 -; GCN-NEXT: s_sub_u32 s16, s14, s6 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_subb_u32 s12, s12, s7 +; GCN-NEXT: s_sub_u32 s14, s13, s6 +; GCN-NEXT: s_subb_u32 s12, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s7 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s10, s15, s11 -; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s12, s7 +; GCN-NEXT: s_cselect_b32 s12, s14, s15 +; GCN-NEXT: s_add_u32 s14, s10, 1 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_add_u32 s16, s10, 2 ; GCN-NEXT: s_addc_u32 s17, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_cselect_b32 s10, s16, s11 -; GCN-NEXT: s_cselect_b32 s11, s17, s15 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_cselect_b32 s12, s16, s14 +; GCN-NEXT: s_cselect_b32 s14, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, 0, s13 +; GCN-NEXT: s_subb_u32 s8, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cmp_ge_u32 s13, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s11, 0 -; GCN-NEXT: s_cselect_b32 s6, s10, s12 +; GCN-NEXT: s_cselect_b32 s7, s14, 0 +; GCN-NEXT: s_cselect_b32 s6, s12, s10 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 -; GCN-IR-NEXT: 
s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ea9bb0417dfa4..862e2dd2de051 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, 
s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 
s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 
s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: 
s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-NEXT: s_sub_u32 s10, 0, s4 -; GCN-NEXT: s_subb_u32 s11, 0, s5 +; GCN-NEXT: s_sub_u32 s8, 0, s4 +; GCN-NEXT: s_subb_u32 s9, 0, s5 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s8 -; GCN-NEXT: s_mul_i32 s14, s10, s8 -; GCN-NEXT: s_add_i32 s9, s15, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s9, s9, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: s_add_u32 s13, s13, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: v_mul_hi_u32 v0, v1, s9 -; 
GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: s_mul_i32 s14, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s9, s2 +; GCN-NEXT: s_mul_i32 s13, s8, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s14, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_add_u32 s9, s13, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s8, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s9, s10, s14 -; GCN-NEXT: s_add_i32 s8, s8, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s8 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 
s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s8, s10 +; GCN-NEXT: s_mul_i32 s9, s9, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s9, s11, s9 -; GCN-NEXT: s_addc_u32 s9, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s8, s12, s8 -; GCN-NEXT: s_add_u32 s8, s9, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s10, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_add_i32 s9, s12, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s8, s10, s8 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s8, s13, s8 +; GCN-NEXT: s_addc_u32 s8, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s9, s10, s9 +; GCN-NEXT: s_add_u32 s8, s8, s9 +; GCN-NEXT: s_addc_u32 s9, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s8 +; GCN-NEXT: s_addc_u32 s10, s10, s9 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, 
i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_addc_u32 s11, 0, s12 ; GCN-NEXT: s_mul_i32 s11, s4, s11 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 @@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s13, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s12, s5 ; GCN-NEXT: s_sub_u32 s16, s6, s4 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s17, s12, s13 ; GCN-NEXT: s_subb_u32 s17, s15, 0 ; GCN-NEXT: s_cmp_ge_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, -1, 0 @@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_cmp_eq_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, s19, s18 ; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s15, s15, s5 -; GCN-NEXT: s_sub_u32 s19, s16, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_subb_u32 s12, s15, s5 +; GCN-NEXT: s_sub_u32 s13, s16, s4 +; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s13, s13, s16 ; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 ; GCN-NEXT: s_subb_u32 s7, s7, s14 @@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr 
addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_u32 s14, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s21 ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-NEXT: s_sub_u32 s2, 0, s4 -; GCN-NEXT: s_subb_u32 s8, 0, s5 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s6, 0, s5 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s7, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s6 -; GCN-NEXT: s_mul_i32 s11, s2, s6 -; GCN-NEXT: s_add_i32 s7, s12, s7 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s7, s7, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_mul_i32 s13, s6, s7 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s7, s9, s7 -; GCN-NEXT: s_add_u32 s7, s10, s7 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 
s11, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s8, s2, s7 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s6, s3 +; GCN-NEXT: s_mul_i32 s10, s2, s3 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_mul_i32 s12, s3, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s7, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_mul_i32 s8, s7, s8 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s3, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s6, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s7, v0 -; GCN-NEXT: s_add_i32 s6, s7, s6 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s2, s2, s11 -; GCN-NEXT: s_add_i32 s6, s6, s8 +; GCN-NEXT: s_addc_u32 s7, s7, s9 +; GCN-NEXT: s_mul_i32 s9, s2, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s2, s2, s8 +; GCN-NEXT: s_add_i32 s6, s9, s6 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s6 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v2, 
s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s6 ; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s2, s9, s2 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s7, v3 -; GCN-NEXT: s_add_u32 s2, s8, s2 -; GCN-NEXT: s_addc_u32 s2, s10, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: s_addc_u32 s7, s7, 0 -; GCN-NEXT: s_mul_i32 s6, s9, s6 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s2, s7, s2 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s2, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s6, s7, s6 ; GCN-NEXT: s_add_u32 s2, s2, s6 -; GCN-NEXT: s_addc_u32 s8, 0, s7 -; GCN-NEXT: s_add_u32 s2, s11, s2 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s6, s9, s8 +; GCN-NEXT: s_addc_u32 s6, 0, s9 +; GCN-NEXT: s_add_u32 s2, s8, s2 +; GCN-NEXT: s_addc_u32 s6, s7, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 ; GCN-NEXT: s_mul_i32 s6, s6, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_add_u32 s6, s8, s6 @@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: s_add_i32 s10, s8, s7 ; GCN-NEXT: s_sub_i32 s8, 0, s10 ; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s9, s6, s7 ; GCN-NEXT: s_subb_u32 s12, s8, s5 ; GCN-NEXT: s_sub_u32 s13, s11, s4 
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s13, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s5 +; GCN-NEXT: s_sub_u32 s9, s13, s4 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 ; GCN-NEXT: s_subb_u32 s6, 0, s10 @@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s9, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bdd22f25e91c8..b000fae124ede 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void 
@s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_add_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_addc_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index fd461ac80ea55..775483c040b7f 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -786,12 +782,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -800,118 +795,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 
s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: 
v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s10, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s10 +; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s11, s1, s0 -; GCN-NEXT: s_sub_i32 s8, 0, s11 -; GCN-NEXT: s_mul_i32 s0, s2, s10 -; GCN-NEXT: s_sub_u32 s12, 24, s0 +; GCN-NEXT: 
s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s9, s0, s1 -; GCN-NEXT: s_subb_u32 s13, s8, s3 -; GCN-NEXT: s_sub_u32 s14, s12, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s12, s11, s2 +; GCN-NEXT: s_subb_u32 s10, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s3 ; GCN-NEXT: s_cselect_b32 s13, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s8, s13, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s3 +; GCN-NEXT: s_cselect_b32 s10, s12, s13 +; GCN-NEXT: s_add_u32 s12, s8, 1 ; GCN-NEXT: s_addc_u32 s13, 0, 0 -; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_add_u32 s14, s8, 2 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s14, s9 -; GCN-NEXT: s_cselect_b32 s9, s15, s13 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s14, s12 +; GCN-NEXT: s_cselect_b32 s12, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s11 +; GCN-NEXT: s_subb_u32 s0, 0, s9 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s9, 0 -; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: s_cselect_b32 s0, s12, 0 +; GCN-NEXT: s_cselect_b32 s1, s10, s8 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -937,8 
+926,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -969,8 +956,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1307,8 +1292,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1336,8 +1319,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s2, s2, s8 ; GCN-IR-NEXT: s_subb_u32 s3, s3, 0 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-IR-NEXT: s_or_b32 s12, s12, s13 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 137dc1fe42294..28e6627b87413 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 
0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: 
s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: 
s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, 
s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; 
GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, 
s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: 
s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: s_addc_u32 s8, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 @@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s0, s2, s8 ; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s8, s0, s1 ; GCN-NEXT: s_subb_u32 s12, s9, s3 ; GCN-NEXT: s_sub_u32 s13, s11, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -912,13 +893,11 @@ 
define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s3 -; GCN-NEXT: s_sub_u32 s16, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s3 +; GCN-NEXT: s_sub_u32 s9, s13, s2 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_subb_u32 s0, 0, s10 @@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cselect_b32 s0, s8, s0 ; GCN-NEXT: s_cselect_b32 s1, s9, s11 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: 
s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s8, s8, s10 ; GCN-IR-NEXT: s_subb_u32 s9, s9, 0 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-IR-NEXT: s_or_b32 s14, s14, s15 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index e8db6471b6a46..8a54ad301f48a 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_sub_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_subb_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1