diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h index d0ed3ff660d9b..ffbbb3313a371 100644 --- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h +++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h @@ -154,8 +154,10 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate { ArrayRef regs() const { return ArrayRef(NewRegs).slice(FirstNew); } - /// createFrom - Create a new virtual register based on OldReg. - Register createFrom(Register OldReg); + /// createFrom - Create a new virtual register based on OldReg. If \p RC is + /// non-null, create the new virtual register from it instead. Subreg spills + /// will feed-in a subregclass derived from the regclass of OldReg. + Register createFrom(Register OldReg, const TargetRegisterClass *RC = nullptr); /// create - Create a new register with the same class and original slot as /// parent. diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 175f205328361..8d735799910de 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1210,7 +1210,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { virtual void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const { llvm_unreachable("Target didn't implement " "TargetInstrInfo::loadRegFromStackSlot!"); diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 54b3f6a04d241..5c271b359e8d0 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -438,6 +438,8 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { LaneBitmask LaneMask, SmallVectorImpl &Indexes) const; + unsigned getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const; + /// The lane masks returned by getSubRegIndexLaneMask() above can only be /// used to determine if sub-registers overlap - they can't be used to /// determine if a set of sub-registers completely cover another @@ -1225,6 +1227,11 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { return true; } + /// To enable the spill-restore of sub registers during RA. This would + /// eventually improve the register allocation for the functions that involve + /// subreg uses of register tuples. + virtual bool shouldEnableSubRegSpillRestore() const { return false; } + /// When prioritizing live ranges in register allocation, if this hook returns /// true then the AllocationPriority of the register class will be treated as /// more important than whether the range is local to a basic block or global. diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index c3e0964594bd5..dfe60764ea270 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -217,7 +217,8 @@ class InlineSpiller : public Spiller { bool coalesceStackAccess(MachineInstr *MI, Register Reg); bool foldMemoryOperand(ArrayRef>, MachineInstr *LoadMI = nullptr); - void insertReload(Register VReg, SlotIndex, MachineBasicBlock::iterator MI); + void insertReload(Register VReg, unsigned SubReg, SlotIndex, + MachineBasicBlock::iterator MI); void insertSpill(Register VReg, bool isKill, MachineBasicBlock::iterator MI); void spillAroundUses(Register Reg); @@ -1112,14 +1113,14 @@ foldMemoryOperand(ArrayRef> Ops, return true; } -void InlineSpiller::insertReload(Register NewVReg, +void InlineSpiller::insertReload(Register NewVReg, unsigned SubReg, SlotIndex Idx, MachineBasicBlock::iterator MI) { MachineBasicBlock &MBB = *MI->getParent(); MachineInstrSpan MIS(MI, &MBB); TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot, - MRI.getRegClass(NewVReg), &TRI, Register()); + MRI.getRegClass(NewVReg), &TRI, Register(), SubReg); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI); @@ -1248,10 +1249,39 @@ void InlineSpiller::spillAroundUses(Register Reg) { // Create a new virtual register for spill/fill. // FIXME: Infer regclass from instruction alone. - Register NewVReg = Edit->createFrom(Reg); + + unsigned SubReg = 0; + LaneBitmask CoveringLanes = LaneBitmask::getNone(); + // Identify the subreg use(s). Skip if the instruction defines the register. + // For copy bundles, get the covering lane masks. + if (TRI.shouldEnableSubRegSpillRestore() && !RI.Writes) { + for (auto [MI, OpIdx] : Ops) { + const MachineOperand &MO = MI->getOperand(OpIdx); + assert(MO.isReg() && MO.getReg() == Reg); + if (MO.isUse()) { + SubReg = MO.getSubReg(); + CoveringLanes |= TRI.getSubRegIndexLaneMask(SubReg); + } + } + } + + const TargetRegisterClass *OrigRC = MRI.getRegClass(Reg); + if (MI.isBundled() && CoveringLanes.any()) { + CoveringLanes = LaneBitmask(bit_ceil(CoveringLanes.getAsInteger()) - 1); + // Get the covering subreg index including the missing indices in the + // identified small range. Even if this is suboptimal, it is advantageous + // when the higher subreg components are not really involved in the bundle + // copy as we emit the subreg reload rather than the one for the entire + // tuple. + SubReg = TRI.getSubRegIdxFromLaneMask(CoveringLanes); + } + + const TargetRegisterClass *NewRC = + SubReg ? TRI.getSubRegisterClass(OrigRC, SubReg) : nullptr; + Register NewVReg = Edit->createFrom(Reg, NewRC); if (RI.Reads) - insertReload(NewVReg, Idx, &MI); + insertReload(NewVReg, SubReg, Idx, &MI); // Rewrite instruction operands. bool hasLiveDef = false; @@ -1259,7 +1289,10 @@ void InlineSpiller::spillAroundUses(Register Reg) { MachineOperand &MO = OpPair.first->getOperand(OpPair.second); MO.setReg(NewVReg); if (MO.isUse()) { - if (!OpPair.first->isRegTiedToDefOperand(OpPair.second)) + if (SubReg && !MI.isBundled()) + MO.setSubReg(0); + if (!OpPair.first->isRegTiedToDefOperand(OpPair.second) || + (SubReg && !MI.isBundled())) MO.setIsKill(); } else { if (!MO.isDead()) diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 5b0365da4e8c6..b1e8825467c04 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -52,8 +52,10 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(Register OldReg, return LI; } -Register LiveRangeEdit::createFrom(Register OldReg) { - Register VReg = MRI.cloneVirtualRegister(OldReg); +Register LiveRangeEdit::createFrom(Register OldReg, + const TargetRegisterClass *RC) { + Register VReg = + RC ? MRI.createVirtualRegister(RC) : MRI.cloneVirtualRegister(OldReg); if (VRM) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 971f822fa6c53..dcdc6aa8c7004 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -545,6 +545,16 @@ TargetRegisterInfo::getRegSizeInBits(Register Reg, return getRegSizeInBits(*RC); } +unsigned +TargetRegisterInfo::getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const { + for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) { + if (getSubRegIndexLaneMask(Idx) == LaneMask) + return Idx; + } + + return 0 /*NoSubRegister*/; +} + bool TargetRegisterInfo::getCoveringSubRegIndexes( const TargetRegisterClass *RC, LaneBitmask LaneMask, SmallVectorImpl &NeededIndexes) const { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 4b4073365483e..e5b616c0291b0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5798,7 +5798,7 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, void AArch64InstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 179574a73aa01..bf614b98cc988 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -360,7 +360,7 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // This tells target independent code that it is okay to pass instructions diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0236745c925ad..578939dfa4483 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1907,7 +1907,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); @@ -1915,6 +1915,14 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned SpillSize = TRI->getSpillSize(*RC); + assert(SubReg != AMDGPU::lo16 && SubReg != AMDGPU::hi16 && + "unhandled 16-bit subregister spilling"); + // For subreg reload, identify the start offset. + unsigned Offset = + SubReg ? llvm::countr_zero( + RI.getSubRegIndexLaneMask(SubReg).getAsInteger()) / + 2 + : 0; MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -1939,19 +1947,23 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RI.spillSGPRToVGPR()) FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); BuildMI(MBB, MI, DL, OpDesc, DestReg) - .addFrameIndex(FrameIndex) // addr - .addMemOperand(MMO) - .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); + .addFrameIndex(FrameIndex) // addr + .addImm(Offset) // offset + .addMemOperand(MMO) + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); return; } + // Convert from Word-offset to byte-ffset. + Offset *= 4; + unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, SpillSize, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset + .addImm(Offset) // offset .addMemOperand(MMO); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 65bebec2118bb..ea68804d7bfd4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -330,7 +330,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 54b584ea66965..e4df2f5727d3c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1095,7 +1095,7 @@ multiclass SI_SPILL_SGPR { def _RESTORE : PseudoInstSI < (outs sgpr_class:$data), - (ins i32imm:$addr)> { + (ins i32imm:$addr, i32imm:$offset)> { let mayStore = 0; let mayLoad = 1; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 48fbf228cfbe5..964cfc6b50af0 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2272,6 +2272,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, if (OnlyToVGPR && !SpillToVGPR) return false; + int SubRegIdx = MI->getOperand(2).getImm(); if (SpillToVGPR) { for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { Register SubReg = @@ -2283,7 +2284,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) .addReg(Spill.VGPR) - .addImm(Spill.Lane); + .addImm(Spill.Lane + SubRegIdx); if (SB.NumSubRegs > 1 && i == 0) MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); if (Indexes) { @@ -2316,7 +2317,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) - .addImm(i); + .addImm(i + SubRegIdx); if (SB.NumSubRegs > 1 && i == 0) MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); if (Indexes) { @@ -3957,6 +3958,8 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, return NewSize <= DstSize || NewSize <= SrcSize; } +bool SIRegisterInfo::shouldEnableSubRegSpillRestore() const { return true; } + unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index fa0de60c11184..621a29b5089af 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -358,6 +358,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const TargetRegisterClass *NewRC, LiveIntervals &LIS) const override; + bool shouldEnableSubRegSpillRestore() const override; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 05bcb3596ac48..a70de4dbaffe2 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -324,7 +324,7 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h index 2cf05ba57bd4b..7de1ffd404ba1 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.h +++ b/llvm/lib/Target/ARC/ARCInstrInfo.h @@ -77,7 +77,7 @@ class ARCInstrInfo : public ARCGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned subReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..59a249320a8ee 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1210,7 +1210,7 @@ Register ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, void ARMBaseInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..dfc24ba9acedc 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -218,7 +218,7 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b8c2fd569ead..09ed93d5570c5 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -145,7 +145,7 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void Thumb1InstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || (DestReg.isPhysical() && isARMLowRegister(DestReg))) && "Unknown regclass!"); diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 68b326c0ebef6..43e9c2ab18325 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -50,7 +50,7 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index f5653d459eac8..d0b764ef11c8a 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -210,7 +210,7 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void Thumb2InstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 1b0bf2d499510..92c44d282b495 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -51,7 +51,7 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index ce9908597dcac..d85ddb480fce3 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -162,7 +162,7 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h index 759aea2010962..61f6ad68b5a30 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.h +++ b/llvm/lib/Target/AVR/AVRInstrInfo.h @@ -85,7 +85,7 @@ class AVRInstrInfo : public AVRGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 409f8b4c253b8..5070282ce1edf 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -151,7 +151,7 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void BPFInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h index 911e880166d29..58d187d6dea61 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -46,7 +46,7 @@ class BPFInstrInfo : public BPFGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 619a797be6dc7..e573c28504211 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -437,7 +437,7 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void CSKYInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 6451c0af14fc0..91634052caba2 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -49,7 +49,7 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 55bafdea234fd..a70f807c8bb47 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1012,7 +1012,7 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void HexagonInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 48adf82833f51..807547f986f0a 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -193,7 +193,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// This function is called for all pseudo instructions diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 02ed1001cd0d3..d3891fcfff26b 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -71,7 +71,7 @@ void LanaiInstrInfo::loadRegFromStackSlot( Register DestinationRegister, int FrameIndex, const TargetRegisterClass *RegisterClass, const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/, - MachineInstr::MIFlag /*Flags*/) const { + unsigned /*SubReg*/, MachineInstr::MIFlag /*Flags*/) const { DebugLoc DL; if (Position != MBB.end()) { DL = Position->getDebugLoc(); diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h index d98276243dc31..50e6e4e53e451 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h @@ -66,7 +66,7 @@ class LanaiInstrInfo : public LanaiGenInstrInfo { MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register DestinationRegister, int FrameIndex, const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo *RegisterInfo, Register VReg, + const TargetRegisterInfo *RegisterInfo, Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 90a4723c9a3ed..c24992e6e2d65 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -150,7 +150,7 @@ void LoongArchInstrInfo::storeRegToStackSlot( void LoongArchInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL; diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index f69a558bdeca9..cb060b7247393 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -42,7 +42,7 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Materializes the given integer Val into DstReg. diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index c6be190bd1245..1ba8ef98b70d1 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -858,7 +858,7 @@ void M68kInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DstReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) && diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h index 97615d60caa0b..4a929e83eb852 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.h +++ b/llvm/lib/Target/M68k/M68kInstrInfo.h @@ -287,7 +287,7 @@ class M68kInstrInfo : public M68kGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 65b4820752c94..1f0f8048cf295 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -59,7 +59,7 @@ void MSP430InstrInfo::storeRegToStackSlot( void MSP430InstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h index 316c136890bf8..b90adb397db97 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h @@ -48,7 +48,7 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 2337ae7c079e7..c99cb5f2568e5 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -152,7 +152,7 @@ class MipsInstrInfo : public MipsGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override { loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0, Flags); } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 23889531431ea..3c150b1e26785 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -47,7 +47,7 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo { * virtual void loadRegFromStackSlot( * MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, * unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, - * const TargetRegisterInfo *TRI, Register VReg, + * const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, * MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const; */ diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 3014aa6bfe31e..96f7b9e911598 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2083,7 +2083,7 @@ void PPCInstrInfo::loadRegFromStackSlotNoUpd( void PPCInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { // We need to avoid a situation in which the value from a VRRC register is // spilled using an Altivec instruction and reloaded into a VSRC register // using a VSX instruction. The issue with this is that the VSX diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index d67fc28935586..ea0fc026e4d1e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -585,7 +585,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Emits a register reload without updating the register class for vector diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b05956b674d18..7bc7f9d4aec45 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -722,7 +722,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void RISCVInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL = diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index c5eddb9e90fbf..4438c0de4c915 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -119,7 +119,7 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; using TargetInstrInfo::foldMemoryOperandImpl; diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index f66eb9dbee2dc..fb2fc8c591731 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -567,7 +567,7 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void SparcInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h index 01d0204734943..5e7854d2ea09f 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -99,7 +99,7 @@ class SparcInstrInfo : public SparcGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; Register getGlobalBaseReg(MachineFunction *MF) const; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2e21f27c9032f..baf995f78f463 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -1039,7 +1039,7 @@ void SystemZInstrInfo::storeRegToStackSlot( void SystemZInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 7b9ad7b87a14f..d247f40dc07cd 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -286,7 +286,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override; diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index d5e804afd27fe..4fc181ef72950 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -522,7 +522,7 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void VEInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h index 408d3ab9e05f5..116a1613b011e 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.h +++ b/llvm/lib/Target/VE/VEInstrInfo.h @@ -98,7 +98,7 @@ class VEInstrInfo : public VEGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// } Stack Spill & Reload diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4ec3583..d9b35aaa33701 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4806,7 +4806,7 @@ void X86InstrInfo::storeRegToStackSlot( void X86InstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..f43d1dfaab784 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -476,7 +476,7 @@ class X86InstrInfo final : public X86GenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index 1a9133aad4dd3..ab485d0043833 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -378,7 +378,7 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end() && !I->isDebugInstr()) diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h index 3543392653786..0660d9cc5f0f5 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.h +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h @@ -77,7 +77,7 @@ class XCoreInstrInfo : public XCoreGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool reverseBranchCondition( diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index be69cefb5b78f..bbac9d8df97fd 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -157,7 +157,7 @@ void XtensaInstrInfo::storeRegToStackSlot( void XtensaInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); unsigned LoadOpcode, StoreOpcode; getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx); diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h index 1808cb36d8a9b..3b48aa221db1b 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h @@ -63,7 +63,7 @@ class XtensaInstrInfo : public XtensaGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + const TargetRegisterInfo *TRI, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Get the load and store opcodes for a given register class and offset. diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 28cb8f871cb9e..d9eec45f11a57 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -4956,10 +4956,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4975,9 +4974,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5002,9 +5000,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5029,9 +5026,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5056,9 +5052,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5083,9 +5078,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5111,11 +5105,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5137,11 +5131,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5164,11 +5157,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5190,11 +5182,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5216,11 +5207,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5243,11 +5233,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5979,9 +5968,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5991,9 +5978,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6004,9 +5989,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6023,9 +6007,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6042,9 +6025,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6061,9 +6043,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6080,9 +6061,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6099,9 +6079,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6127,12 +6106,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6153,12 +6132,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6179,12 +6158,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6205,12 +6184,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6231,12 +6210,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6257,12 +6236,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8106,80 +8085,78 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_readlane_b32 s17, v21, 2 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 1 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: v_mov_b32_e32 v16, s16 ; SI-NEXT: s_lshl_b32 s16, s68, 8 ; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s66, 0xff ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s64, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s66, 0xff ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s64, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v21, 0 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s17, v21, 0 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s43, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 63 ; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 62 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 63 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 62 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v22, 0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v22, 1 +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v22, 2 +; SI-NEXT: v_readlane_b32 s17, v22, 0 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v22, 3 +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s17, v22, 2 +; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: v_readlane_b32 s18, v22, 4 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -8208,16 +8185,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v22, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v22, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v22, 5 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v22, 8 -; SI-NEXT: v_readlane_b32 s17, v22, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 10 +; SI-NEXT: v_readlane_b32 s17, v22, 10 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 @@ -8243,15 +8217,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v22, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v22, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v22, 14 -; SI-NEXT: v_readlane_b32 s15, v22, 15 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v22, 16 +; SI-NEXT: v_readlane_b32 s15, v22, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 @@ -8277,15 +8249,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v22, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v22, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v22, 20 -; SI-NEXT: v_readlane_b32 s13, v22, 21 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v22, 22 +; SI-NEXT: v_readlane_b32 s13, v22, 22 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s14, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 @@ -8311,15 +8281,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v22, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v22, 25 ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v22, 26 -; SI-NEXT: v_readlane_b32 s11, v22, 27 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v22, 28 +; SI-NEXT: v_readlane_b32 s11, v22, 28 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 @@ -8344,15 +8312,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v22, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v22, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v22, 32 -; SI-NEXT: v_readlane_b32 s9, v22, 33 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v22, 34 +; SI-NEXT: v_readlane_b32 s9, v22, 34 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 @@ -8378,15 +8344,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v22, 36 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v22, 37 ; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v22, 38 -; SI-NEXT: v_readlane_b32 s7, v22, 39 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s8, v22, 40 +; SI-NEXT: v_readlane_b32 s7, v22, 40 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s8, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 @@ -8411,12 +8375,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s30, v20, 34 -; SI-NEXT: v_readlane_b32 s19, v22, 11 -; SI-NEXT: v_readlane_b32 s17, v22, 17 -; SI-NEXT: v_readlane_b32 s15, v22, 23 -; SI-NEXT: v_readlane_b32 s13, v22, 29 -; SI-NEXT: v_readlane_b32 s11, v22, 35 -; SI-NEXT: v_readlane_b32 s9, v22, 41 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s31, v20, 35 ; SI-NEXT: v_readlane_b32 s99, v20, 33 @@ -9373,9 +9331,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: v_readlane_b32 s12, v21, 26 -; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: v_readlane_b32 s13, v21, 0 ; VI-NEXT: s_and_b32 s12, s12, 0xff -; VI-NEXT: s_lshl_b32 s13, s14, 8 +; VI-NEXT: s_lshl_b32 s13, s13, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 @@ -9403,9 +9361,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: v_readlane_b32 s10, v21, 21 -; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: v_readlane_b32 s11, v21, 2 ; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_lshl_b32 s11, s11, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 @@ -9433,9 +9391,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_readlane_b32 s8, v21, 16 -; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: v_readlane_b32 s9, v21, 4 ; VI-NEXT: s_and_b32 s8, s8, 0xff -; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 @@ -9463,9 +9421,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_readlane_b32 s6, v21, 11 -; VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: v_readlane_b32 s7, v21, 6 ; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s7, s8, 8 +; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -9490,10 +9448,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s30, v20, 30 -; VI-NEXT: v_readlane_b32 s15, v21, 1 -; VI-NEXT: v_readlane_b32 s13, v21, 3 -; VI-NEXT: v_readlane_b32 s11, v21, 5 -; VI-NEXT: v_readlane_b32 s9, v21, 7 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_readlane_b32 s31, v20, 31 ; VI-NEXT: v_readlane_b32 s87, v20, 29 @@ -10464,9 +10418,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: v_readlane_b32 s6, v21, 5 -; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: v_readlane_b32 s7, v21, 0 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -10488,7 +10442,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_readlane_b32 s30, v20, 34 -; GFX9-NEXT: v_readlane_b32 s9, v21, 1 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: v_readlane_b32 s31, v20, 35 ; GFX9-NEXT: v_readlane_b32 s99, v20, 33 @@ -11409,23 +11362,23 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: s_and_b32 s0, s28, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s85, 8 ; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s29, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s83, 8 ; GFX11-NEXT: s_and_b32 s16, s82, 0xff ; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_readlane_b32 s17, v19, 2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff @@ -11435,8 +11388,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_and_b32 s2, s40, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s61, 8 ; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s41, 0xff @@ -11451,13 +11403,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 4 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: s_and_b32 s0, s14, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s58, 8 ; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s15, 0xff @@ -11477,7 +11430,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s65, 8 ; GFX11-NEXT: s_and_b32 s12, s64, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s14 ; GFX11-NEXT: s_and_b32 s12, s13, 0xff @@ -11492,16 +11444,16 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: s_and_b32 s0, s10, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s52, 8 ; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s11, 0xff @@ -11521,7 +11473,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s39, 8 ; GFX11-NEXT: s_and_b32 s8, s38, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s10 ; GFX11-NEXT: s_and_b32 s8, s9, 0xff @@ -11536,13 +11487,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 12 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: s_and_b32 s0, s6, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s56, 8 ; GFX11-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s7, 0xff @@ -11562,7 +11514,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s47, 8 ; GFX11-NEXT: s_and_b32 s4, s104, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s6 ; GFX11-NEXT: s_and_b32 s4, s5, 0xff @@ -11580,9 +11531,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 @@ -42305,10 +42253,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -42324,9 +42271,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42351,9 +42297,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42378,9 +42323,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42405,9 +42349,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42432,9 +42375,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42460,11 +42402,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42486,11 +42428,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -42513,11 +42454,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -42539,11 +42479,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42565,11 +42504,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -42592,11 +42530,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -43328,9 +43265,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -43340,9 +43275,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -43353,9 +43286,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43372,9 +43304,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43391,9 +43322,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43410,9 +43340,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43429,9 +43358,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43448,9 +43376,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43476,12 +43403,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -43502,12 +43429,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -43528,12 +43455,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -43554,12 +43481,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -43580,12 +43507,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -43606,12 +43533,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -44795,9 +44722,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -45055,152 +44982,152 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v2, s5, 1.0 ; SI-NEXT: v_add_f32_e64 v1, s4, 1.0 ; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 8 ; SI-NEXT: v_add_f32_e64 v4, s7, 1.0 ; SI-NEXT: v_add_f32_e64 v3, s6, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 8 ; SI-NEXT: v_add_f32_e64 v6, s9, 1.0 ; SI-NEXT: v_add_f32_e64 v5, s8, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 8 ; SI-NEXT: v_add_f32_e64 v8, s11, 1.0 ; SI-NEXT: v_add_f32_e64 v7, s10, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 8 ; SI-NEXT: v_add_f32_e64 v10, s13, 1.0 ; SI-NEXT: v_add_f32_e64 v9, s12, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 8 ; SI-NEXT: v_add_f32_e64 v12, s15, 1.0 ; SI-NEXT: v_add_f32_e64 v11, s14, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 8 ; SI-NEXT: v_add_f32_e64 v16, s41, 1.0 ; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 8 ; SI-NEXT: v_add_f32_e64 v21, s43, 1.0 ; SI-NEXT: v_add_f32_e64 v20, s42, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 8 -; SI-NEXT: v_add_f32_e64 v26, s45, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s44, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v27, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s44, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 8 -; SI-NEXT: v_add_f32_e64 v30, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s28, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[26:27], 8 +; SI-NEXT: v_add_f32_e64 v31, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s28, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[30:31], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[30:31], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 8 +; SI-NEXT: v_lshr_b64 v[13:14], v[30:31], 8 ; SI-NEXT: v_add_f32_e64 v36, s27, 1.0 ; SI-NEXT: v_add_f32_e64 v35, s26, 1.0 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 8 ; SI-NEXT: v_add_f32_e64 v49, s25, 1.0 @@ -45209,147 +45136,149 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 8 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[52:53], 24 ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v4 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v4 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v6 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v6 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v10 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v10 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 -; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 -; SI-NEXT: v_add_f32_e64 v58, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 -; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v30 -; SI-NEXT: v_lshr_b64 v[32:33], v[40:41], 8 -; SI-NEXT: v_lshr_b64 v[37:38], v[45:46], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[57:58], 16 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 -; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 -; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 -; SI-NEXT: v_lshr_b64 v[50:51], v[57:58], 24 -; SI-NEXT: v_lshr_b64 v[43:44], v[57:58], 8 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; SI-NEXT: v_add_f32_e64 v43, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v42, s20, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; SI-NEXT: v_add_f32_e64 v59, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v58, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v44, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[42:43], 24 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v27 +; SI-NEXT: v_lshr_b64 v[23:24], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[44:45], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[58:59], 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v31 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[24:25], v[42:43], 8 +; SI-NEXT: v_lshr_b64 v[28:29], v[44:45], 24 +; SI-NEXT: v_lshr_b64 v[33:34], v[44:45], 8 +; SI-NEXT: v_lshr_b64 v[50:51], v[58:59], 24 +; SI-NEXT: v_lshr_b64 v[38:39], v[58:59], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v36 ; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v53 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v41 -; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v46 -; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v58 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v43 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v59 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: ; SI-NEXT: ; implicit-def: $sgpr46 @@ -45534,82 +45463,80 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: .LBB37_4: ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 34 -; SI-NEXT: v_mov_b32_e32 v54, s4 +; SI-NEXT: v_mov_b32_e32 v39, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 35 -; SI-NEXT: v_mov_b32_e32 v51, s4 +; SI-NEXT: v_mov_b32_e32 v34, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 36 -; SI-NEXT: v_mov_b32_e32 v39, s4 +; SI-NEXT: v_mov_b32_e32 v22, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 37 -; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: v_mov_b32_e32 v40, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 38 -; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_mov_b32_e32 v29, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 -; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 40 -; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: v_mov_b32_e32 v18, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 42 -; SI-NEXT: v_mov_b32_e32 v24, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 43 ; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 44 +; SI-NEXT: v_readlane_b32 s4, v61, 42 ; SI-NEXT: v_mov_b32_e32 v19, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 45 +; SI-NEXT: v_readlane_b32 s4, v61, 43 ; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 46 +; SI-NEXT: v_readlane_b32 s4, v61, 44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 45 +; SI-NEXT: v_mov_b32_e32 v56, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 46 +; SI-NEXT: v_mov_b32_e32 v55, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 47 -; SI-NEXT: v_mov_b32_e32 v59, s4 +; SI-NEXT: v_mov_b32_e32 v57, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 48 -; SI-NEXT: v_mov_b32_e32 v56, s4 +; SI-NEXT: v_mov_b32_e32 v54, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 49 ; SI-NEXT: v_mov_b32_e32 v47, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 50 -; SI-NEXT: v_mov_b32_e32 v44, s4 +; SI-NEXT: v_mov_b32_e32 v41, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 51 -; SI-NEXT: v_mov_b32_e32 v55, s4 +; SI-NEXT: v_mov_b32_e32 v46, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 52 -; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_mov_b32_e32 v51, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 53 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 54 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 55 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 56 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 57 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 59 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 60 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 61 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 63 @@ -45621,27 +45548,27 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 2 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 5 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 6 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 7 @@ -45649,247 +45576,234 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 8 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 9 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 10 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 12 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 13 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 32 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s48 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 33 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 30 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 31 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 29 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 26 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 27 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 24 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 25 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 23 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 21 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 18 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 19 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 16 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 17 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 14 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 15 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 12 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 11 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 9 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 7 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 5 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 2 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 3 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s50 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s52 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s54 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s64 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s66 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s68 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s70 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s80 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s82 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s84 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s86 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s96 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s98 -; SI-NEXT: v_mov_b32_e32 v27, s62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s46 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s72 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s74 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s58 +; SI-NEXT: v_mov_b32_e32 v23, s74 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s60 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s76 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, s16 -; SI-NEXT: v_mov_b32_e32 v58, s17 -; SI-NEXT: v_mov_b32_e32 v45, s18 -; SI-NEXT: v_mov_b32_e32 v46, s19 -; SI-NEXT: v_mov_b32_e32 v40, s20 -; SI-NEXT: v_mov_b32_e32 v41, s21 +; SI-NEXT: v_mov_b32_e32 v23, s76 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s62 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v23, s78 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v42, s20 +; SI-NEXT: v_mov_b32_e32 v43, s21 ; SI-NEXT: v_mov_b32_e32 v52, s22 ; SI-NEXT: v_mov_b32_e32 v53, s23 ; SI-NEXT: v_mov_b32_e32 v48, s24 ; SI-NEXT: v_mov_b32_e32 v49, s25 ; SI-NEXT: v_mov_b32_e32 v35, s26 ; SI-NEXT: v_mov_b32_e32 v36, s27 -; SI-NEXT: v_mov_b32_e32 v29, s28 -; SI-NEXT: v_mov_b32_e32 v30, s29 -; SI-NEXT: v_mov_b32_e32 v25, s44 -; SI-NEXT: v_mov_b32_e32 v26, s45 +; SI-NEXT: v_mov_b32_e32 v30, s28 +; SI-NEXT: v_mov_b32_e32 v31, s29 +; SI-NEXT: v_mov_b32_e32 v26, s44 +; SI-NEXT: v_mov_b32_e32 v27, s45 ; SI-NEXT: v_mov_b32_e32 v20, s42 ; SI-NEXT: v_mov_b32_e32 v21, s43 ; SI-NEXT: v_mov_b32_e32 v15, s40 @@ -45904,101 +45818,85 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s9 ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_mov_b32_e32 v2, s5 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s60 +; SI-NEXT: v_mov_b32_e32 v13, s72 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, s78 -; SI-NEXT: v_mov_b32_e32 v31, s88 -; SI-NEXT: v_mov_b32_e32 v32, s90 -; SI-NEXT: v_mov_b32_e32 v33, s92 -; SI-NEXT: v_mov_b32_e32 v37, s94 -; SI-NEXT: v_mov_b32_e32 v38, s30 +; SI-NEXT: v_mov_b32_e32 v23, s88 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, s90 +; SI-NEXT: v_mov_b32_e32 v28, s92 +; SI-NEXT: v_mov_b32_e32 v32, s94 +; SI-NEXT: v_mov_b32_e32 v33, s30 ; SI-NEXT: v_mov_b32_e32 v50, s34 -; SI-NEXT: v_mov_b32_e32 v42, s36 -; SI-NEXT: v_mov_b32_e32 v43, s38 +; SI-NEXT: v_mov_b32_e32 v37, s36 +; SI-NEXT: v_mov_b32_e32 v38, s38 ; SI-NEXT: .LBB37_5: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v43 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v43, v57, v43 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_or_b32_e32 v50, v50, v42 -; SI-NEXT: v_and_b32_e32 v42, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v50, v42, v50 -; SI-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v54 -; SI-NEXT: v_and_b32_e32 v51, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v50, v50, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v39 -; SI-NEXT: v_or_b32_e32 v39, v39, v51 -; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v39, v50, v39 -; SI-NEXT: v_add_i32_e32 v50, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 ; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 -; SI-NEXT: v_or_b32_e32 v38, v39, v38 -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v38, v58, v38 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v50 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v33, v33, v37 -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v33, v37, v33 -; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v37, v50, v37 +; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v23, v33, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v37, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_add_i32_e32 v34, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v22, v33, v22 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v22, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v28 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v32 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v34 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v24 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_readlane_b32 s30, v63, 34 @@ -46037,56 +45935,70 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_readlane_b32 s36, v63, 2 ; SI-NEXT: v_readlane_b32 s35, v63, 1 ; SI-NEXT: v_readlane_b32 s34, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v43 ; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v53 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v13, v13, v18 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v48 -; SI-NEXT: v_or_b32_e32 v14, v17, v14 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v48 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v60 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v55 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v54 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 @@ -46094,21 +46006,18 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v35 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 @@ -46117,85 +46026,77 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_and_b32_e32 v13, 0xff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v47 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v46 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v29 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v30 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v30 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v51 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v26 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v26 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v27 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 @@ -46204,34 +46105,31 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v14, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v14, 0xff, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v17, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xff, v21 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 @@ -46243,26 +46141,24 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload @@ -46281,33 +46177,30 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_or_b32_e32 v13, v11, v13 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 @@ -46319,33 +46212,30 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 @@ -46357,21 +46247,18 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 @@ -46379,11 +46266,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 @@ -46395,32 +46282,30 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 @@ -46432,33 +46317,30 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -46470,34 +46352,31 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -46521,9 +46400,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -47493,9 +47372,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -47519,9 +47397,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -47545,9 +47422,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -47571,9 +47447,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -47597,9 +47472,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -47623,15 +47497,13 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 @@ -47649,15 +47521,13 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x58, v0 @@ -47675,15 +47545,13 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x60, v0 @@ -47701,15 +47569,13 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x68, v0 @@ -47727,15 +47593,13 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x70, v0 @@ -47753,15 +47617,13 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x78, v0 @@ -48770,9 +48632,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48789,9 +48650,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48808,9 +48668,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48827,9 +48686,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48846,9 +48704,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48865,9 +48722,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48884,9 +48740,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48913,9 +48768,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48938,9 +48792,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48963,9 +48816,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -48988,9 +48840,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -49013,9 +48864,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -49038,9 +48888,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -74723,10 +74572,9 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 @@ -74738,10 +74586,9 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 @@ -74753,10 +74600,9 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 @@ -79762,10 +79608,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79781,9 +79626,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79808,9 +79652,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79835,9 +79678,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79862,9 +79704,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79889,9 +79730,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79917,11 +79757,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79943,11 +79783,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -79970,11 +79809,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -79996,11 +79834,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80022,11 +79859,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -80049,11 +79885,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -80785,9 +80620,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -80797,9 +80630,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -80810,9 +80641,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80829,9 +80659,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80848,9 +80677,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80867,9 +80695,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80886,9 +80713,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80905,9 +80731,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80933,12 +80758,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -80959,12 +80784,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -80985,12 +80810,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -81011,12 +80836,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -81037,12 +80862,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -81063,12 +80888,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -82929,80 +82754,78 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_readlane_b32 s17, v21, 1 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 0 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: v_mov_b32_e32 v16, s16 ; SI-NEXT: s_lshl_b32 s16, s68, 8 ; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s66, 0xff ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s64, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s66, 0xff ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s64, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v22, 63 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s17, v22, 63 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s43, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v22, 62 ; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 61 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 62 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 61 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v22, 0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v22, 1 +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v22, 2 +; SI-NEXT: v_readlane_b32 s17, v22, 0 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v22, 3 +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s17, v22, 2 +; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: v_readlane_b32 s18, v22, 4 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -83031,16 +82854,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v22, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v22, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v22, 5 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v22, 8 -; SI-NEXT: v_readlane_b32 s17, v22, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v22, 10 +; SI-NEXT: v_readlane_b32 s17, v22, 10 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 @@ -83066,15 +82886,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v22, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v22, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v22, 14 -; SI-NEXT: v_readlane_b32 s15, v22, 15 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v22, 16 +; SI-NEXT: v_readlane_b32 s15, v22, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 @@ -83100,15 +82918,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v22, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v22, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v22, 20 -; SI-NEXT: v_readlane_b32 s13, v22, 21 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v22, 22 +; SI-NEXT: v_readlane_b32 s13, v22, 22 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s14, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 @@ -83134,15 +82950,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v22, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v22, 25 ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v22, 26 -; SI-NEXT: v_readlane_b32 s11, v22, 27 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v22, 28 +; SI-NEXT: v_readlane_b32 s11, v22, 28 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 @@ -83168,15 +82982,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v22, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v22, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v22, 32 -; SI-NEXT: v_readlane_b32 s9, v22, 33 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v22, 34 +; SI-NEXT: v_readlane_b32 s9, v22, 34 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 @@ -83203,12 +83015,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s6, v22, 36 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v22, 37 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s54, 0xff -; SI-NEXT: v_readlane_b32 s8, v22, 38 +; SI-NEXT: v_readlane_b32 s7, v22, 38 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s8, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 @@ -83233,12 +83044,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s30, v20, 34 -; SI-NEXT: v_readlane_b32 s19, v22, 11 -; SI-NEXT: v_readlane_b32 s17, v22, 17 -; SI-NEXT: v_readlane_b32 s15, v22, 23 -; SI-NEXT: v_readlane_b32 s13, v22, 29 -; SI-NEXT: v_readlane_b32 s11, v22, 35 -; SI-NEXT: v_readlane_b32 s9, v22, 39 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s31, v20, 35 ; SI-NEXT: v_readlane_b32 s99, v20, 33 @@ -84195,9 +84000,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s12 ; VI-NEXT: v_readlane_b32 s12, v21, 26 -; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: v_readlane_b32 s13, v21, 0 ; VI-NEXT: s_and_b32 s12, s12, 0xff -; VI-NEXT: s_lshl_b32 s13, s14, 8 +; VI-NEXT: s_lshl_b32 s13, s13, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 @@ -84225,9 +84030,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s10 ; VI-NEXT: v_readlane_b32 s10, v21, 21 -; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: v_readlane_b32 s11, v21, 2 ; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_lshl_b32 s11, s11, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 @@ -84255,9 +84060,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_readlane_b32 s8, v21, 16 -; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: v_readlane_b32 s9, v21, 4 ; VI-NEXT: s_and_b32 s8, s8, 0xff -; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 @@ -84285,9 +84090,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_readlane_b32 s6, v21, 11 -; VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: v_readlane_b32 s7, v21, 6 ; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s7, s8, 8 +; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -84312,10 +84117,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s30, v20, 30 -; VI-NEXT: v_readlane_b32 s15, v21, 1 -; VI-NEXT: v_readlane_b32 s13, v21, 3 -; VI-NEXT: v_readlane_b32 s11, v21, 5 -; VI-NEXT: v_readlane_b32 s9, v21, 7 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_readlane_b32 s31, v20, 31 ; VI-NEXT: v_readlane_b32 s87, v20, 29 @@ -85286,9 +85087,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: v_readlane_b32 s6, v21, 5 -; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: v_readlane_b32 s7, v21, 0 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -85310,7 +85111,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_readlane_b32 s30, v20, 34 -; GFX9-NEXT: v_readlane_b32 s9, v21, 1 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: v_readlane_b32 s31, v20, 35 ; GFX9-NEXT: v_readlane_b32 s99, v20, 33 @@ -86233,23 +86033,23 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: s_and_b32 s0, s28, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s85, 8 ; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s29, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s83, 8 ; GFX11-NEXT: s_and_b32 s16, s82, 0xff ; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_readlane_b32 s17, v19, 2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff @@ -86259,8 +86059,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_and_b32 s2, s40, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s61, 8 ; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s41, 0xff @@ -86275,13 +86074,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 4 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: s_and_b32 s0, s14, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s58, 8 ; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s15, 0xff @@ -86301,7 +86101,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s65, 8 ; GFX11-NEXT: s_and_b32 s12, s64, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s14 ; GFX11-NEXT: s_and_b32 s12, s13, 0xff @@ -86316,16 +86115,16 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: s_and_b32 s0, s10, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s52, 8 ; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s11, 0xff @@ -86345,7 +86144,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s39, 8 ; GFX11-NEXT: s_and_b32 s8, s38, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s10 ; GFX11-NEXT: s_and_b32 s8, s9, 0xff @@ -86360,13 +86158,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s3, v19, 12 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: s_and_b32 s0, s6, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s56, 8 ; GFX11-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s7, 0xff @@ -86386,7 +86185,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s47, 8 ; GFX11-NEXT: s_and_b32 s4, s104, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s6 ; GFX11-NEXT: s_and_b32 s4, s5, 0xff @@ -86404,9 +86202,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 @@ -115117,10 +114912,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 ; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -115134,10 +114928,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -115151,9 +114944,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115176,9 +114968,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115203,9 +114994,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115230,9 +115020,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115257,9 +115046,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115285,11 +115073,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115311,11 +115099,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -115338,11 +115125,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -115365,11 +115151,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -115390,11 +115175,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -115417,11 +115201,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -116137,9 +115920,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116149,9 +115930,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116161,9 +115940,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116173,9 +115950,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116191,9 +115966,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116210,9 +115984,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116229,9 +116002,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116248,9 +116020,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116267,9 +116038,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116295,12 +116065,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116321,12 +116091,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116347,12 +116117,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116373,12 +116143,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116399,12 +116169,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116425,12 +116195,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -117614,9 +117384,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -117871,8 +117641,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 ; SI-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v4 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v6 ; SI-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -117921,23 +117698,23 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 ; SI-NEXT: v_add_f64 v[17:18], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v18 ; SI-NEXT: v_add_f64 v[19:20], s[28:29], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v18 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v20 ; SI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 @@ -117947,7 +117724,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v20 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v22 ; SI-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 @@ -117957,58 +117734,54 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v22 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; SI-NEXT: v_add_f64 v[38:39], s[22:23], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[48:49], s[22:23], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v39 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v49 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49 ; SI-NEXT: v_add_f64 v[52:53], s[20:21], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v39 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v49 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v53 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v53 -; SI-NEXT: v_add_f64 v[44:45], s[18:19], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v53 +; SI-NEXT: v_add_f64 v[43:44], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v45 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v53 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v45 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v44 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 8 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 8 -; SI-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 24 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill @@ -118022,17 +117795,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 16 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 8 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 24 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill @@ -118046,8 +117819,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -118062,100 +117835,97 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 24 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 16 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 8 +; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 16 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 24 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 8 +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 16 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[34:35], v[21:22], 8 +; SI-NEXT: v_lshr_b64 v[35:36], v[23:24], 24 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 8 +; SI-NEXT: v_lshr_b64 v[38:39], v[48:49], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[39:40], v[48:49], 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[25:26], v[21:22], 24 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[47:48], v[23:24], 16 -; SI-NEXT: v_add_f64 v[58:59], s[16:17], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 8 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 8 +; SI-NEXT: v_lshr_b64 v[40:41], v[52:53], 24 +; SI-NEXT: v_lshr_b64 v[45:46], v[52:53], 8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v2 +; SI-NEXT: v_add_f64 v[57:58], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[41:42], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[43:44], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v42, v54 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_lshr_b64 v[54:55], v[43:44], 16 ; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v2 -; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 24 -; SI-NEXT: v_lshr_b64 v[49:50], v[38:39], 24 -; SI-NEXT: v_lshr_b64 v[40:41], v[38:39], 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[25:26], v[44:45], 8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_lshr_b64 v[50:51], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[52:53], 24 -; SI-NEXT: v_lshr_b64 v[54:55], v[52:53], 8 -; SI-NEXT: v_lshr_b64 v[26:27], v[58:59], 24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v2 -; SI-NEXT: v_mov_b32_e32 v51, v28 -; SI-NEXT: v_lshr_b64 v[42:43], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[55:56], v[44:45], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v43, v29 -; SI-NEXT: v_lshr_b64 v[56:57], v[44:45], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[58:59], 8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v30, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v45 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: v_lshr_b64 v[50:51], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[55:56], v[43:44], 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[57:58], 16 +; SI-NEXT: v_mov_b32_e32 v51, v27 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: v_lshr_b64 v[59:60], v[57:58], 24 +; SI-NEXT: v_lshr_b64 v[26:27], v[57:58], 8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v30, 8, v44 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v58 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: ; SI-NEXT: ; implicit-def: $sgpr46 @@ -118340,26 +118110,27 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: .LBB73_4: ; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 34 -; SI-NEXT: v_mov_b32_e32 v37, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 35 ; SI-NEXT: v_mov_b32_e32 v51, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 35 +; SI-NEXT: v_mov_b32_e32 v42, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 36 -; SI-NEXT: v_mov_b32_e32 v43, s4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v47, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 37 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v57, s4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v56, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 38 -; SI-NEXT: v_mov_b32_e32 v33, s4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 -; SI-NEXT: v_mov_b32_e32 v30, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 +; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 40 -; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 42 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -118418,27 +118189,27 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 56 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 57 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 58 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 59 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 60 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 61 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 62 @@ -118449,12 +118220,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 @@ -118466,300 +118233,288 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 4 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 5 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 6 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 7 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 8 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 9 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 10 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 12 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 13 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_mov_b32_e32 v29, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_mov_b32_e32 v30, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 15 ; SI-NEXT: v_mov_b32_e32 v31, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v27, s98 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s86 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s96 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s84 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s86 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s82 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s84 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s80 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s82 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s70 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s80 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s68 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s70 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s66 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s68 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s64 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s66 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s54 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s64 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s52 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s54 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s50 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s50 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 2 -; SI-NEXT: v_readlane_b32 s5, v61, 3 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 4 -; SI-NEXT: v_readlane_b32 s5, v61, 5 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 6 -; SI-NEXT: v_readlane_b32 s5, v61, 7 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 8 -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 10 -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 12 -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 14 -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 16 -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 18 -; SI-NEXT: v_readlane_b32 s5, v61, 19 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 20 -; SI-NEXT: v_readlane_b32 s5, v61, 21 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 22 -; SI-NEXT: v_readlane_b32 s5, v61, 23 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 24 -; SI-NEXT: v_readlane_b32 s5, v61, 25 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 26 -; SI-NEXT: v_readlane_b32 s5, v61, 27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 28 -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: v_readlane_b32 s4, v61, 30 -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 32 -; SI-NEXT: v_readlane_b32 s5, v61, 33 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s48 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s48 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s4, v61, 32 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, s4 +; SI-NEXT: v_mov_b32_e32 v58, s17 +; SI-NEXT: v_mov_b32_e32 v57, s16 +; SI-NEXT: v_mov_b32_e32 v44, s19 +; SI-NEXT: v_mov_b32_e32 v43, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v26, s38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, s36 +; SI-NEXT: v_mov_b32_e32 v59, s34 +; SI-NEXT: v_mov_b32_e32 v55, s30 +; SI-NEXT: v_mov_b32_e32 v54, s94 +; SI-NEXT: v_mov_b32_e32 v46, s92 +; SI-NEXT: v_mov_b32_e32 v45, s90 +; SI-NEXT: v_mov_b32_e32 v41, s88 +; SI-NEXT: v_mov_b32_e32 v40, s78 +; SI-NEXT: v_mov_b32_e32 v39, s76 +; SI-NEXT: v_mov_b32_e32 v38, s74 +; SI-NEXT: v_mov_b32_e32 v50, s72 +; SI-NEXT: v_mov_b32_e32 v37, s62 +; SI-NEXT: v_mov_b32_e32 v36, s60 +; SI-NEXT: v_mov_b32_e32 v35, s58 +; SI-NEXT: v_mov_b32_e32 v34, s56 +; SI-NEXT: v_mov_b32_e32 v33, s46 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xff, v58 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v46 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v31 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v59 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v32 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v31 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v55 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v43 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v55 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v46 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v60 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v30 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 @@ -118809,9 +118564,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_readlane_b32 s36, v63, 2 ; SI-NEXT: v_readlane_b32 s35, v63, 1 ; SI-NEXT: v_readlane_b32 s34, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 @@ -118819,25 +118571,25 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v26, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v52 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v53 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 @@ -118849,25 +118601,25 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v40 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v39 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v49 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v50 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v39 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xff, v49 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 @@ -118879,10 +118631,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v37 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v36 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v35 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 @@ -118891,12 +118643,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 @@ -118907,26 +118659,22 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v34 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -118943,20 +118691,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 @@ -118964,7 +118709,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 @@ -118981,20 +118726,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 @@ -119002,12 +118744,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 @@ -119019,20 +118761,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v18, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 @@ -119040,15 +118779,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -119057,23 +118796,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v13, v15 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -119095,20 +118831,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 @@ -119134,20 +118867,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0 @@ -119172,20 +118902,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 @@ -119209,21 +118936,18 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 @@ -119231,11 +118955,13 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 @@ -119246,63 +118972,59 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v57 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -119322,9 +119044,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -119333,8 +119055,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -119602,43 +119324,41 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v2 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 @@ -119694,21 +119414,21 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v17 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v20 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v19 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v22 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v21 @@ -119720,23 +119440,25 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v25 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v5 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v24 @@ -119751,13 +119473,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v30 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v30 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v31 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v31 ; VI-NEXT: s_branch .LBB73_5 ; VI-NEXT: .LBB73_3: ; VI-NEXT: ; implicit-def: $sgpr46 @@ -119917,15 +119637,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: s_branch .LBB73_2 ; VI-NEXT: .LBB73_4: ; VI-NEXT: v_mov_b32_e32 v33, s71 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s70 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s66 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s65 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s64 @@ -119968,14 +119690,13 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 3 ; VI-NEXT: v_mov_b32_e32 v52, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_mov_b32_e32 v46, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 5 ; VI-NEXT: v_mov_b32_e32 v37, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 6 ; VI-NEXT: v_mov_b32_e32 v43, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 8 ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -119983,8 +119704,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_mov_b32_e32 v42, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 ; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_mov_b32_e32 v40, s48 @@ -119992,7 +119712,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_readlane_b32 s4, v62, 12 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 13 ; VI-NEXT: v_mov_b32_e32 v44, s4 @@ -120028,10 +119748,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v40, s36 -; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -120068,23 +119788,23 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 ; VI-NEXT: v_mov_b32_e32 v53, s4 @@ -120095,7 +119815,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 46 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; VI-NEXT: v_mov_b32_e32 v54, s4 @@ -120108,12 +119828,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v61, s4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_readlane_b32 s4, v62, 55 ; VI-NEXT: v_mov_b32_e32 v36, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 @@ -120148,15 +119868,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v29, s18 ; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 -; VI-NEXT: v_mov_b32_e32 v42, s70 ; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_mov_b32_e32 v46, v38 -; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v40, s88 @@ -120189,15 +119906,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v40, s46 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, v43 +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_mov_b32_e32 v42, v43 +; VI-NEXT: v_mov_b32_e32 v43, v46 +; VI-NEXT: v_mov_b32_e32 v46, v38 +; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: .LBB73_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v50 -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s30, v63, 30 ; VI-NEXT: v_readlane_b32 s31, v63, 31 ; VI-NEXT: v_readlane_b32 s87, v63, 29 @@ -120230,8 +119952,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s36, v63, 2 ; VI-NEXT: v_readlane_b32 s35, v63, 1 ; VI-NEXT: v_readlane_b32 s34, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v34, v50, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -120242,13 +119964,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v32, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120263,13 +119984,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v30, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v30 ; VI-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120284,9 +120004,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120305,9 +120024,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v26, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120326,9 +120044,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120339,12 +120056,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120352,9 +120069,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120365,12 +120081,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120378,9 +120094,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120404,9 +120119,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120430,9 +120144,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120456,15 +120169,13 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 @@ -120486,8 +120197,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_or_b32_sdwa v9, v38, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -120508,8 +120218,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v52 ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -120530,12 +120239,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v43, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen @@ -120553,10 +120259,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v33 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -120569,11 +120274,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -120582,14 +120286,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload @@ -120607,8 +120309,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -121491,25 +121193,24 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v37, v37, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v58, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v37, v37, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v41 @@ -121550,48 +121251,47 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s36, v63, 2 ; GFX9-NEXT: v_readlane_b32 s35, v63, 1 ; GFX9-NEXT: v_readlane_b32 s34, v63, 0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v30 ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v50, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v29, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v25, v29, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v48, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v37, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v47 @@ -121600,9 +121300,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121614,9 +121313,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121628,9 +121326,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121640,11 +121337,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -121657,9 +121354,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121676,13 +121372,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 @@ -121695,9 +121390,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121724,9 +121418,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121749,9 +121442,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121774,9 +121466,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121799,9 +121490,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -121820,15 +121510,14 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 @@ -121847,13 +121536,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164453,11 +164141,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v43 -; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v43 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164473,9 +164160,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164500,9 +164186,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164521,10 +164206,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60 ; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164538,10 +164222,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164555,10 +164238,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 ; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164572,9 +164254,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164599,9 +164280,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164626,9 +164306,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164653,9 +164332,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164681,11 +164359,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164707,11 +164385,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164757,11 +164435,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -164783,10 +164460,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164800,10 +164476,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 ; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166085,16 +165760,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 @@ -166103,7 +165768,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload @@ -166126,7 +165799,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen @@ -166138,9 +165813,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166157,9 +165831,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166176,9 +165849,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166195,9 +165867,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:28 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166214,9 +165885,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166233,9 +165903,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166252,9 +165921,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166270,9 +165938,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166284,11 +165950,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 ; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166302,10 +165966,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166319,10 +165982,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166340,12 +166002,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166366,12 +166028,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166392,12 +166054,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -166418,12 +166080,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -168901,8 +168563,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -168975,553 +168637,568 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v46, v21 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v18, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s28 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v35 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v48 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v55 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v44 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v53, 1.0, v45 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_mov_b32_e32 v42, v37 -; SI-NEXT: v_alignbit_b32 v37, v2, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v44, v4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_mov_b32_e32 v51, v27 +; SI-NEXT: v_alignbit_b32 v39, v2, v11, 16 +; SI-NEXT: v_alignbit_b32 v27, v44, v4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_readfirstlane_b32 s5, v27 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 ; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: v_alignbit_b32 v2, v2, v15, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 ; SI-NEXT: v_writelane_b32 v62, s7, 1 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_alignbit_b32 v15, v52, v6, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v14, v52, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_readfirstlane_b32 s5, v15 ; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_alignbit_b32 v19, v45, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v2, v8, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: v_alignbit_b32 v2, v2, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v56 +; SI-NEXT: v_alignbit_b32 v2, v2, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47 ; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_alignbit_b32 v47, v45, v47, 16 +; SI-NEXT: v_alignbit_b32 v56, v8, v56, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_readfirstlane_b32 s5, v47 -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_readfirstlane_b32 s5, v56 +; SI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v58 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: v_mov_b32_e32 v4, v58 -; SI-NEXT: v_alignbit_b32 v58, v8, v41, 16 +; SI-NEXT: v_alignbit_b32 v58, v16, v41, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_readfirstlane_b32 s5, v58 -; SI-NEXT: v_alignbit_b32 v2, v2, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v2, v2, v60, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v23, v22 -; SI-NEXT: v_mov_b32_e32 v40, v36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v42, v36 +; SI-NEXT: v_mov_b32_e32 v50, v29 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_alignbit_b32 v41, v15, v6, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v41 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v47 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_alignbit_b32 v32, v1, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s5, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 8 +; SI-NEXT: v_alignbit_b32 v41, v1, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s5, v41 +; SI-NEXT: v_mov_b32_e32 v9, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v2, v2, v61, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_alignbit_b32 v59, v1, v13, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v59 -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 8 -; SI-NEXT: v_alignbit_b32 v61, v1, v17, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v2, v2, v21, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_alignbit_b32 v2, v2, v13, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_alignbit_b32 v2, v2, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_alignbit_b32 v59, v1, v17, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v59 ; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_alignbit_b32 v60, v2, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v57, v2, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_alignbit_b32 v1, v2, v46, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s5, v57 ; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 8 -; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v23 -; SI-NEXT: v_mov_b32_e32 v5, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v25, v2, v26, 16 +; SI-NEXT: v_alignbit_b32 v25, v2, v23, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: v_readfirstlane_b32 s5, v25 -; SI-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v2, v2, v24, 16 ; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v2, v30, 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v1, v22 +; SI-NEXT: v_alignbit_b32 v22, v2, v29, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: v_alignbit_b32 v2, v2, v27, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v34, 16 ; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_alignbit_b32 v17, v2, v36, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v17, v2, v36, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_readfirstlane_b32 s5, v17 -; SI-NEXT: v_alignbit_b32 v2, v2, v34, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v21, 16 ; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v35, v2, v28, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v43 -; SI-NEXT: v_mov_b32_e32 v31, v20 -; SI-NEXT: v_mov_b32_e32 v20, v34 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_alignbit_b32 v30, v2, v36, 16 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_alignbit_b32 v2, v2, v39, 16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s5, v30 +; SI-NEXT: v_readfirstlane_b32 s5, v35 +; SI-NEXT: v_alignbit_b32 v2, v2, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[64:65], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[68:69], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, v36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v57, v2, v39, 16 +; SI-NEXT: v_alignbit_b32 v14, v2, v5, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_readfirstlane_b32 s5, v57 -; SI-NEXT: v_alignbit_b32 v2, v2, v50, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_alignbit_b32 v2, v2, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 ; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: v_mov_b32_e32 v49, v27 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v46, v2, v38, 16 +; SI-NEXT: v_alignbit_b32 v11, v2, v40, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_readfirstlane_b32 s5, v46 +; SI-NEXT: v_mov_b32_e32 v27, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v43 +; SI-NEXT: v_readfirstlane_b32 s5, v11 ; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v27 ; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[84:85], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[96:97], s[4:5], 8 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v2, v53, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; SI-NEXT: v_alignbit_b32 v36, v2, v53, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s5, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v41 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v59 +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v11 ; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v48 +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v56 +; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; SI-NEXT: v_mov_b32_e32 v40, v30 +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v30 +; SI-NEXT: v_mov_b32_e32 v30, v39 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v43, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v38 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; SI-NEXT: v_mov_b32_e32 v18, v21 +; SI-NEXT: v_mov_b32_e32 v28, v60 +; SI-NEXT: v_mov_b32_e32 v60, v23 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: v_mov_b32_e32 v46, v13 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v20, v31 +; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: v_mov_b32_e32 v43, v38 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: v_mov_b32_e32 v40, v36 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_mov_b32_e32 v42, v36 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_mov_b32_e32 v40, v30 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_mov_b32_e32 v50, v29 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr54 @@ -169534,120 +169211,82 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: .LBB91_3: ; %Flow -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: .LBB91_3: ; %Flow +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, vcc +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v10, v3 ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v10, v6, v4, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s52, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -169658,10 +169297,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s86, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -169671,109 +169313,167 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s80, v12 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s86, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v6, v4, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v16, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s52, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s38, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s66, v13 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s80, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v15, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s66, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v36, v4, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_readfirstlane_b32 s5, v36 +; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v15, v7, v6, 16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 +; SI-NEXT: v_alignbit_b32 v11, v6, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v46, v6, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v7, v6, 16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v57, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_readfirstlane_b32 s87, v11 +; SI-NEXT: v_readfirstlane_b32 s30, v8 +; SI-NEXT: s_lshr_b64 s[84:85], s[86:87], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[86:87], 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v16, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v14, v6, v3, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v18, v7, v6, 16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v30, v6, v3, 16 +; SI-NEXT: v_alignbit_b32 v35, v6, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_alignbit_b32 v18, v9, v7, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v20, v9, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v17, v7, v3, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s38, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s90, v16 +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s90, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s76, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s30, v23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s76, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s62, v20 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s62, v21 ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_readfirstlane_b32 s87, v46 -; SI-NEXT: v_readfirstlane_b32 s81, v57 -; SI-NEXT: v_readfirstlane_b32 s67, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s81, v14 +; SI-NEXT: v_readfirstlane_b32 s67, v35 +; SI-NEXT: v_readfirstlane_b32 s53, v17 +; SI-NEXT: s_lshr_b64 s[48:49], s[52:53], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[52:53], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 8 ; SI-NEXT: s_lshr_b64 s[54:55], s[66:67], 24 ; SI-NEXT: s_lshr_b64 s[64:65], s[66:67], 16 ; SI-NEXT: s_lshr_b64 s[68:69], s[66:67], 8 @@ -169781,277 +169481,249 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshr_b64 s[70:71], s[80:81], 16 ; SI-NEXT: s_lshr_b64 s[82:83], s[80:81], 8 ; SI-NEXT: s_lshr_b64 s[80:81], s[86:87], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[86:87], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[86:87], 8 ; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v17, v7, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s53, v17 -; SI-NEXT: s_lshr_b64 s[48:49], s[52:53], 24 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v21, v12, v10, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_readfirstlane_b32 s56, v21 -; SI-NEXT: s_lshr_b64 s[50:51], s[52:53], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 8 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; SI-NEXT: v_alignbit_b32 v22, v9, v3, 16 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v23, v12, v10, 16 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s56, v23 ; SI-NEXT: v_readfirstlane_b32 s39, v22 -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 ; SI-NEXT: s_lshr_b64 s[36:37], s[38:39], 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v24, v15, v13, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v25, v10, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_readfirstlane_b32 s31, v25 -; SI-NEXT: v_readfirstlane_b32 s26, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_lshr_b64 s[94:95], s[30:31], 16 ; SI-NEXT: s_lshr_b64 s[34:35], s[30:31], 8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s42, v26 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s20, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v27, v18, v16, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_alignbit_b32 v57, v12, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v60, v12, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s91, v60 -; SI-NEXT: v_readfirstlane_b32 s14, v27 -; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_readfirstlane_b32 s91, v57 ; SI-NEXT: s_lshr_b64 s[88:89], s[90:91], 16 ; SI-NEXT: s_lshr_b64 s[92:93], s[90:91], 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v29, v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s8, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v27, v15, v13, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s26, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_alignbit_b32 v61, v11, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s77, v61 -; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_alignbit_b32 v59, v13, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: v_readfirstlane_b32 s77, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 ; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 ; SI-NEXT: s_lshr_b64 s[78:79], s[76:77], 8 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s20, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v18 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_alignbit_b32 v21, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v41, v15, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s14, v23 +; SI-NEXT: v_readfirstlane_b32 s63, v41 +; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v30, v20, v18, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v59, v36, v3, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s63, v59 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_readfirstlane_b32 s8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v21 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_alignbit_b32 v41, v49, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s57, v41 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v12 +; SI-NEXT: v_readfirstlane_b32 s57, v32 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v10 ; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v41 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_alignbit_b32 v58, v32, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_alignbit_b32 v58, v37, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_readfirstlane_b32 s43, v58 ; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v47, v45, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s27, v47 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 +; SI-NEXT: v_alignbit_b32 v56, v26, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s27, v56 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v56 ; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v19, v11, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v19, v45, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v19 ; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v14, v52, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 +; SI-NEXT: v_alignbit_b32 v48, v52, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s15, v48 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v48 ; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v11, v44, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 +; SI-NEXT: v_alignbit_b32 v49, v44, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_readfirstlane_b32 s9, v49 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 ; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 ; SI-NEXT: v_writelane_b32 v62, s6, 0 @@ -170069,8 +169741,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshr_b64 s[30:31], s[38:39], 24 ; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 8 ; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 ; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s6, 0xff @@ -170082,27 +169754,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s16, 8 ; SI-NEXT: s_lshl_b32 s6, s8, 24 ; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_lshl_b32 s4, s4, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 1 ; SI-NEXT: v_readlane_b32 s99, v63, 33 ; SI-NEXT: v_readlane_b32 s97, v63, 31 ; SI-NEXT: v_readlane_b32 s87, v63, 29 @@ -170130,22 +169801,22 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s22, 8 ; SI-NEXT: s_lshl_b32 s6, s14, 24 ; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v60 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -170157,23 +169828,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v15 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s28, 8 -; SI-NEXT: s_lshl_b32 s6, s20, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v45 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_lshl_b32 s6, s20, 24 ; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -170184,12 +169853,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 @@ -170199,7 +169868,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_lshl_b32 s6, s26, 24 ; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -170211,12 +169880,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen @@ -170234,16 +169903,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_lshl_b32 s5, s72, 8 ; SI-NEXT: s_lshl_b32 s6, s56, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v5 @@ -170262,16 +169933,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_lshl_b32 s6, s62, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 @@ -170290,20 +169963,22 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 ; SI-NEXT: s_lshl_b32 s5, s92, 8 ; SI-NEXT: s_lshl_b32 s6, s76, 24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -170320,27 +169995,29 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 ; SI-NEXT: s_lshl_b32 s5, s34, 8 ; SI-NEXT: s_lshl_b32 s6, s90, 24 ; SI-NEXT: v_readlane_b32 s34, v63, 0 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -170353,10 +170030,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_lshl_b32 s5, s38, 8 ; SI-NEXT: s_lshl_b32 s6, s30, 24 @@ -170375,6 +170052,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -170384,21 +170062,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s6, s48, 24 ; SI-NEXT: v_readlane_b32 s52, v63, 10 ; SI-NEXT: v_readlane_b32 s48, v63, 6 ; SI-NEXT: v_readlane_b32 s36, v63, 2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -170410,6 +170085,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -170419,21 +170095,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s68, 8 ; SI-NEXT: s_lshl_b32 s6, s54, 24 ; SI-NEXT: v_readlane_b32 s68, v63, 18 ; SI-NEXT: v_readlane_b32 s54, v63, 12 ; SI-NEXT: v_readlane_b32 s50, v63, 8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -170455,10 +170128,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_lshl_b32 s5, s82, 8 ; SI-NEXT: s_lshl_b32 s6, s66, 24 @@ -170486,10 +170159,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_lshl_b32 s6, s80, 24 ; SI-NEXT: v_readlane_b32 s96, v63, 30 @@ -170521,10 +170194,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s86, 24 ; SI-NEXT: v_readlane_b32 s86, v63, 28 ; SI-NEXT: v_readlane_b32 s84, v63, 26 @@ -170545,7 +170218,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -170556,15 +170229,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 -; SI-NEXT: v_readlane_b32 s98, v63, 32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_readlane_b32 s98, v63, 32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -170586,8 +170257,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -172058,9 +171729,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -172077,9 +171747,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172104,9 +171773,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172155,9 +171823,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172182,9 +171849,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172209,9 +171875,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172236,9 +171901,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172263,9 +171927,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172289,10 +171952,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -172306,10 +171968,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -172324,10 +171985,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -172345,11 +172006,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172364,10 +172024,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -172386,10 +172045,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -172404,10 +172062,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -173597,26 +173254,27 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: s_branch .LBB91_2 ; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v44, s6 +; GFX9-NEXT: v_readlane_b32 s6, v62, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v10, s34 +; GFX9-NEXT: s_mov_b32 s34, s6 +; GFX9-NEXT: v_readlane_b32 s6, v62, 6 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v1, s76 +; GFX9-NEXT: v_mov_b32_e32 v11, s36 +; GFX9-NEXT: s_mov_b32 s36, s6 +; GFX9-NEXT: v_readlane_b32 s6, v62, 4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v1, s77 +; GFX9-NEXT: v_mov_b32_e32 v29, s38 +; GFX9-NEXT: s_mov_b32 s38, s6 +; GFX9-NEXT: v_readlane_b32 s6, v62, 2 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v46, s51 -; GFX9-NEXT: v_mov_b32_e32 v56, s50 -; GFX9-NEXT: v_mov_b32_e32 v58, s49 ; GFX9-NEXT: v_mov_b32_e32 v60, s48 -; GFX9-NEXT: v_mov_b32_e32 v27, s39 -; GFX9-NEXT: v_mov_b32_e32 v29, s38 -; GFX9-NEXT: v_mov_b32_e32 v10, s34 -; GFX9-NEXT: v_mov_b32_e32 v11, s36 -; GFX9-NEXT: v_readlane_b32 s34, v62, 8 -; GFX9-NEXT: v_readlane_b32 s36, v62, 6 -; GFX9-NEXT: v_readlane_b32 s38, v62, 4 -; GFX9-NEXT: v_readlane_b32 s48, v62, 2 -; GFX9-NEXT: v_readlane_b32 s50, v62, 0 +; GFX9-NEXT: s_mov_b32 s48, s6 +; GFX9-NEXT: v_readlane_b32 s6, v62, 0 ; GFX9-NEXT: v_mov_b32_e32 v42, s46 ; GFX9-NEXT: v_mov_b32_e32 v41, s47 ; GFX9-NEXT: v_mov_b32_e32 v55, s15 @@ -173642,22 +173300,25 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v25, s52 ; GFX9-NEXT: v_mov_b32_e32 v6, s70 ; GFX9-NEXT: v_mov_b32_e32 v12, s7 -; GFX9-NEXT: v_mov_b32_e32 v44, s6 ; GFX9-NEXT: v_mov_b32_e32 v23, s71 ; GFX9-NEXT: v_mov_b32_e32 v43, s67 ; GFX9-NEXT: v_mov_b32_e32 v24, s69 +; GFX9-NEXT: v_mov_b32_e32 v46, s51 ; GFX9-NEXT: v_mov_b32_e32 v21, s68 ; GFX9-NEXT: v_mov_b32_e32 v45, s64 ; GFX9-NEXT: v_mov_b32_e32 v22, s66 +; GFX9-NEXT: v_mov_b32_e32 v56, s50 ; GFX9-NEXT: v_mov_b32_e32 v19, s65 ; GFX9-NEXT: v_mov_b32_e32 v47, s99 ; GFX9-NEXT: v_mov_b32_e32 v20, s55 +; GFX9-NEXT: v_mov_b32_e32 v58, s49 ; GFX9-NEXT: v_mov_b32_e32 v17, s54 ; GFX9-NEXT: v_mov_b32_e32 v57, s96 ; GFX9-NEXT: v_mov_b32_e32 v18, s98 ; GFX9-NEXT: v_mov_b32_e32 v15, s97 ; GFX9-NEXT: v_mov_b32_e32 v59, s85 ; GFX9-NEXT: v_mov_b32_e32 v16, s87 +; GFX9-NEXT: v_mov_b32_e32 v27, s39 ; GFX9-NEXT: v_mov_b32_e32 v13, s86 ; GFX9-NEXT: v_mov_b32_e32 v61, s82 ; GFX9-NEXT: v_mov_b32_e32 v14, s84 @@ -173710,11 +173371,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_readlane_b32 s47, v62, 47 ; GFX9-NEXT: v_readlane_b32 s8, v62, 48 ; GFX9-NEXT: v_readlane_b32 s7, v62, 49 -; GFX9-NEXT: v_readlane_b32 s35, v62, 9 -; GFX9-NEXT: v_readlane_b32 s37, v62, 7 -; GFX9-NEXT: v_readlane_b32 s39, v62, 5 -; GFX9-NEXT: v_readlane_b32 s49, v62, 3 -; GFX9-NEXT: v_readlane_b32 s51, v62, 1 +; GFX9-NEXT: s_mov_b32 s50, s6 ; GFX9-NEXT: .LBB91_5: ; %end ; GFX9-NEXT: s_and_b32 s6, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 @@ -174447,10 +174104,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v26.l ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11 ; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s42, 0x10010 @@ -174466,7 +174123,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7 ; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 @@ -174485,7 +174142,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42 ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s73 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v30.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 @@ -174530,8 +174187,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[4:5] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v11 ; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 @@ -174775,7 +174432,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v38.l ; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo @@ -174816,7 +174473,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v1.l @@ -174838,7 +174495,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1 ; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s8, 0x10010 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v51.l ; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s8 @@ -174853,7 +174510,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v12, v8 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v50.l @@ -174898,13 +174555,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1 ; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s5, 16 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v12, v2 ; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s4, 0x10010 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s4 @@ -174941,14 +174598,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v55.l ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[22:23] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[20:21] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v2.l -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[16:17] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[20:21] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[10:11] ; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19] ; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5 ; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff @@ -174958,23 +174615,25 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s12 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s14, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[4:5] ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s61 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v65 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[64:65] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[68:69] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v65 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63 ; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s11, 0x10010 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v64 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v65 ; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, s11 ; GFX11-TRUE16-NEXT: s_bitset1_b32 s11, 22 ; GFX11-TRUE16-NEXT: s_addk_i32 s12, 0x7fff ; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s11, s12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v65 ; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s12, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v64 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v64 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v69 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v69 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v68 @@ -174991,14 +174650,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s62 @@ -175187,19 +174844,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74 ; GFX11-TRUE16-NEXT: s_branch .LBB91_2 ; GFX11-TRUE16-NEXT: .LBB91_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, s4 :: v_dual_mov_b32 v70, s83 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v43, 2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s94 :: v_dual_mov_b32 v14, s30 -; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v43, 2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s49 :: v_dual_mov_b32 v5, s35 -; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 3 -; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_lo, v43, 6 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 0 -; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v43, 4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, s42 :: v_dual_mov_b32 v54, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, s4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v43, 6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v49, s98 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, s4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v43, 0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, s46 :: v_dual_mov_b32 v38, s47 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s97 :: v_dual_mov_b32 v39, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s30, s4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v43, 4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s59 :: v_dual_mov_b32 v36, s60 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s9 :: v_dual_mov_b32 v32, s61 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v33, s62 @@ -175208,22 +174866,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s7 :: v_dual_mov_b32 v27, s28 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s29 :: v_dual_mov_b32 v25, s6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s87 :: v_dual_mov_b32 v64, s86 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s85 :: v_dual_mov_b32 v10, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, s4 :: v_dual_mov_b32 v68, s48 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, s81 :: v_dual_mov_b32 v66, s84 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s83 :: v_dual_mov_b32 v69, s70 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s82 :: v_dual_mov_b32 v23, s80 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v71, s66 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s39 :: v_dual_mov_b32 v21, s69 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v81, s55 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s67 :: v_dual_mov_b32 v19, s65 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v83, s51 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v17, s54 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v11, s52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v12, s50 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s78 :: v_dual_mov_b32 v7, s88 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s85 :: v_dual_mov_b32 v68, s48 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v66, s84 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, s81 :: v_dual_mov_b32 v22, s82 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, s70 :: v_dual_mov_b32 v80, s71 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s80 :: v_dual_mov_b32 v20, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v82, s68 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s69 :: v_dual_mov_b32 v18, s67 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v84, s38 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s65 :: v_dual_mov_b32 v16, s64 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v86, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s54 :: v_dual_mov_b32 v10, s52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v96, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s50 :: v_dual_mov_b32 v4, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v87, s34 :: v_dual_mov_b32 v2, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s35 :: v_dual_mov_b32 v6, s78 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v8, s90 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s88 :: v_dual_mov_b32 v12, s92 ; GFX11-TRUE16-NEXT: s_mov_b32 s58, s11 ; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 8 ; GFX11-TRUE16-NEXT: v_readlane_b32 s60, v43, 9 @@ -175255,12 +174914,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_readlane_b32 s92, v42, 3 ; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 4 ; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v42, 5 -; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_hi, v43, 7 ; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v42, 6 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v42, 7 ; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v42, 8 -; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v43, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 s34, s4 ; GFX11-TRUE16-NEXT: .LBB91_5: ; %end ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s104, 8 @@ -175423,15 +175080,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v5, 8, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v4, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v24 @@ -175440,9 +175097,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v30 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v26, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v26, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v25, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v25, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v30, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v85 @@ -175453,25 +175110,25 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v83 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v26, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v26, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v28, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v29, v30 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v11, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v12, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v10, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v11, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v82 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 @@ -175481,7 +175138,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v81 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v39 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v17 @@ -175501,7 +175158,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v18, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -175511,13 +175168,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v17, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v48 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v53 @@ -175526,49 +175183,49 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v50 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v49 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v52 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v21, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v9 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v8, v13 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v41, 7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v10, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v9, v3 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[4:7], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:112 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v41, 8 ; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v41, 6 ; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v41, 5 @@ -176760,19 +176417,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 ; GFX11-FAKE16-NEXT: s_branch .LBB91_2 ; GFX11-FAKE16-NEXT: .LBB91_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s4, v43, 2 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30 -; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v43, 2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35 -; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 3 -; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_lo, v43, 6 -; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v43, 0 -; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v43, 4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, s4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s4, v43, 6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, s4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s4, v43, 0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: s_mov_b32 s30, s4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s4, v43, 4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62 @@ -176782,7 +176440,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80 @@ -176794,6 +176451,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92 @@ -176828,12 +176487,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_readlane_b32 s92, v42, 3 ; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 4 ; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v42, 5 -; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 7 ; GFX11-FAKE16-NEXT: v_readlane_b32 s46, v42, 6 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v43, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 7 ; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 8 -; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v43, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s34, s4 ; GFX11-FAKE16-NEXT: .LBB91_5: ; %end ; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s104, 8 @@ -191761,9 +191418,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -191804,15 +191460,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -191826,15 +191480,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -191850,13 +191503,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v12 @@ -191871,15 +191523,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -191895,11 +191546,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -192046,16 +191696,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v12 @@ -192073,13 +191721,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -192857,9 +192505,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -192876,9 +192523,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -192920,12 +192566,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -192946,12 +192592,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -192972,12 +192618,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -192998,12 +192644,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -193024,12 +192670,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -193050,12 +192696,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -194268,8 +193914,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -194334,149 +193980,152 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 +; SI-NEXT: v_mov_b32_e32 v31, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v44, v18 +; SI-NEXT: v_mov_b32_e32 v33, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v46, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v20, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v48 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v60 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v52 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v53 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v54 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB95_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v22 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_readfirstlane_b32 s5, v20 ; SI-NEXT: s_or_b32 s44, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_readfirstlane_b32 s5, v19 ; SI-NEXT: s_or_b32 s45, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 4 @@ -194487,13 +194136,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v28 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: v_readfirstlane_b32 s5, v27 ; SI-NEXT: s_or_b32 s42, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v13 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 +; SI-NEXT: v_readfirstlane_b32 s5, v24 ; SI-NEXT: s_or_b32 s43, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 10 @@ -194504,14 +194153,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_or_b32 s40, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: v_readfirstlane_b32 s4, v29 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v39 +; SI-NEXT: v_readfirstlane_b32 s5, v15 ; SI-NEXT: s_or_b32 s41, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 16 @@ -194522,32 +194171,37 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 -; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v23 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v20 +; SI-NEXT: v_readfirstlane_b32 s5, v21 +; SI-NEXT: v_mov_b32_e32 v19, v30 ; SI-NEXT: s_or_b32 s28, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: v_readfirstlane_b32 s4, v19 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v39, v5 -; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v43 ; SI-NEXT: v_readfirstlane_b32 s46, v55 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: v_mov_b32_e32 v40, v34 -; SI-NEXT: v_mov_b32_e32 v41, v21 -; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_mov_b32_e32 v39, v47 +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v52, v45 ; SI-NEXT: s_lshr_b32 s71, s45, 8 ; SI-NEXT: s_lshr_b32 s70, s43, 8 ; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: v_bfe_u32 v38, v47, 8, 8 -; SI-NEXT: v_bfe_u32 v37, v33, 8, 8 -; SI-NEXT: v_bfe_u32 v35, v32, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v21, 8, 8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_bfe_u32 v23, v6, 8, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_mov_b32_e32 v55, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s5, v16 ; SI-NEXT: s_or_b32 s29, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 22 @@ -194558,15 +194212,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v43 ; SI-NEXT: s_or_b32 s26, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v21 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v43 +; SI-NEXT: v_readfirstlane_b32 s5, v42 ; SI-NEXT: s_or_b32 s27, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 28 @@ -194576,21 +194229,19 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 24 +; SI-NEXT: v_mov_b32_e32 v43, v42 ; SI-NEXT: v_writelane_b32 v62, s5, 25 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b32 s68, s29, 8 ; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: v_bfe_u32 v43, v31, 8, 8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v15 ; SI-NEXT: s_or_b32 s24, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_or_b32 s25, s5, s4 @@ -194601,15 +194252,19 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 -; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v29 +; SI-NEXT: v_readfirstlane_b32 s5, v26 +; SI-NEXT: v_mov_b32_e32 v12, v26 ; SI-NEXT: s_or_b32 s22, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_readfirstlane_b32 s5, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v6 ; SI-NEXT: s_or_b32 s23, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 40 @@ -194620,107 +194275,95 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 -; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: v_readfirstlane_b32 s4, v18 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_readfirstlane_b32 s5, v14 ; SI-NEXT: s_or_b32 s20, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v42 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_readfirstlane_b32 s5, v59 ; SI-NEXT: s_or_b32 s21, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 ; SI-NEXT: v_writelane_b32 v62, s4, 44 ; SI-NEXT: v_writelane_b32 v62, s5, 45 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 -; SI-NEXT: v_mov_b32_e32 v58, v11 ; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v7, v29 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v41 ; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v9 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 -; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v21 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v24 -; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_mov_b32_e32 v1, v53 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_readfirstlane_b32 s5, v34 +; SI-NEXT: v_mov_b32_e32 v3, v41 +; SI-NEXT: v_mov_b32_e32 v41, v34 ; SI-NEXT: s_lshr_b32 s64, s25, 8 ; SI-NEXT: s_lshr_b32 s54, s23, 8 ; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 ; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: v_mov_b32_e32 v13, v12 -; SI-NEXT: v_bfe_u32 v24, v12, 8, 8 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_bfe_u32 v48, v48, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v59, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, v40 +; SI-NEXT: v_bfe_u32 v34, v19, 8, 8 +; SI-NEXT: v_mov_b32_e32 v14, v16 +; SI-NEXT: v_mov_b32_e32 v16, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s19, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v11 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_bfe_u32 v18, v11, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: s_or_b32 s16, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v47 +; SI-NEXT: v_mov_b32_e32 v27, v1 +; SI-NEXT: s_lshr_b32 s50, s19, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_readfirstlane_b32 s4, v25 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v42 +; SI-NEXT: v_readfirstlane_b32 s5, v44 ; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v8 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v46 +; SI-NEXT: v_readfirstlane_b32 s5, v10 ; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v46 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: v_readfirstlane_b32 s5, v33 ; SI-NEXT: s_or_b32 s12, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v36 ; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v27 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v28 +; SI-NEXT: v_readfirstlane_b32 s5, v32 ; SI-NEXT: s_or_b32 s10, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v45 +; SI-NEXT: v_readfirstlane_b32 s5, v2 ; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s4, v45 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v57 +; SI-NEXT: v_readfirstlane_b32 s5, v56 ; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v59 +; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v56 +; SI-NEXT: v_readfirstlane_b32 s5, v57 ; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_readfirstlane_b32 s4, v61 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v25 +; SI-NEXT: v_readfirstlane_b32 s5, v58 ; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v50 +; SI-NEXT: v_readfirstlane_b32 s5, v60 ; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v54 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v53 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v54 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_or_b32 s5, s46, s5 ; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 @@ -194728,6 +194371,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 ; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 ; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 +; SI-NEXT: v_bfe_u32 v37, v1, 8, 8 +; SI-NEXT: v_mov_b32_e32 v1, v54 ; SI-NEXT: s_lshr_b32 s48, s17, 8 ; SI-NEXT: s_lshr_b32 s67, s15, 8 ; SI-NEXT: s_lshr_b32 s65, s13, 8 @@ -194735,6 +194380,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b32 s53, s9, 8 ; SI-NEXT: s_lshr_b32 s51, s7, 8 ; SI-NEXT: s_lshr_b32 s49, s5, 8 +; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 ; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 @@ -194756,171 +194404,166 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 ; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_mov_b32_e32 v57, v30 -; SI-NEXT: v_bfe_u32 v50, v30, 8, 8 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v33 -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v54, v9 -; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v9, v40, 8, 8 +; SI-NEXT: v_mov_b32_e32 v57, v13 +; SI-NEXT: v_bfe_u32 v47, v13, 8, 8 +; SI-NEXT: v_mov_b32_e32 v13, v29 +; SI-NEXT: v_bfe_u32 v53, v29, 8, 8 +; SI-NEXT: v_mov_b32_e32 v29, v21 +; SI-NEXT: v_bfe_u32 v36, v17, 8, 8 +; SI-NEXT: v_bfe_u32 v21, v42, 8, 8 +; SI-NEXT: v_bfe_u32 v19, v7, 8, 8 +; SI-NEXT: v_mov_b32_e32 v40, v8 +; SI-NEXT: v_bfe_u32 v56, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v2, v5, 8, 8 +; SI-NEXT: v_mov_b32_e32 v54, v4 ; SI-NEXT: v_bfe_u32 v45, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v47, v52, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v1, 8, 8 -; SI-NEXT: v_mov_b32_e32 v3, v14 -; SI-NEXT: v_mov_b32_e32 v25, v59 -; SI-NEXT: v_mov_b32_e32 v1, v52 -; SI-NEXT: v_mov_b32_e32 v44, v11 +; SI-NEXT: v_bfe_u32 v44, v38, 8, 8 +; SI-NEXT: v_bfe_u32 v58, v51, 8, 8 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_bfe_u32 v46, v1, 8, 8 +; SI-NEXT: v_mov_b32_e32 v1, v51 +; SI-NEXT: v_mov_b32_e32 v10, v17 ; SI-NEXT: s_branch .LBB95_3 ; SI-NEXT: .LBB95_2: -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v7, v29 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: v_mov_b32_e32 v39, v5 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v12, v26 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v20, v43 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v40 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_mov_b32_e32 v52, v45 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_mov_b32_e32 v50, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v40, v34 +; SI-NEXT: v_mov_b32_e32 v49, v11 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v48, v9 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_mov_b32_e32 v39, v47 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v3, v41 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v57, v30 +; SI-NEXT: v_mov_b32_e32 v41, v34 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_mov_b32_e32 v57, v13 ; SI-NEXT: v_writelane_b32 v62, s4, 16 ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v13, v29 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v46, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v29, v6 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v28, v8 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v5 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v54, v4 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v22, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v1, v51 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v16, v38 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v25, v59 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v1, v52 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 @@ -194945,6 +194588,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: v_writelane_b32 v62, s5, 43 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 44 ; SI-NEXT: v_writelane_b32 v62, s5, 45 ; SI-NEXT: v_writelane_b32 v62, s80, 46 @@ -194955,143 +194607,146 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s81, 49 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: .LBB95_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v14, v17 -; SI-NEXT: v_mov_b32_e32 v17, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v11, v13 +; SI-NEXT: v_mov_b32_e32 v13, v18 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, vcc -; SI-NEXT: v_mov_b32_e32 v23, v2 -; SI-NEXT: v_mov_b32_e32 v59, v34 -; SI-NEXT: v_mov_b32_e32 v2, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v8, v27 +; SI-NEXT: v_mov_b32_e32 v31, v28 +; SI-NEXT: v_mov_b32_e32 v32, v29 ; SI-NEXT: s_cbranch_vccnz .LBB95_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v18, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_mov_b32_e32 v29, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v12 +; SI-NEXT: v_bfe_u32 v53, v11, 8, 8 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_bfe_u32 v50, v57, 8, 8 -; SI-NEXT: v_bfe_u32 v48, v12, 8, 8 -; SI-NEXT: v_bfe_u32 v43, v32, 8, 8 -; SI-NEXT: v_bfe_u32 v24, v13, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v31, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v54, 8, 8 -; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v55, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v53, 8, 8 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_bfe_u32 v47, v57, 8, 8 +; SI-NEXT: v_bfe_u32 v36, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v21, v42, 8, 8 +; SI-NEXT: v_bfe_u32 v9, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v37, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v19, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v56, v40, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v54, 8, 8 +; SI-NEXT: v_bfe_u32 v44, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v58, v1, 8, 8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_bfe_u32 v46, v33, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s5, v15 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v53 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s5, v33 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 ; SI-NEXT: s_lshr_b32 s49, s5, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_mov_b32_e32 v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v60 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v1 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s51, s7, 8 ; SI-NEXT: v_readfirstlane_b32 s8, v14 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s9, v14 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v16 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s10, v14 @@ -195100,24 +194755,22 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 ; SI-NEXT: s_lshr_b32 s53, s9, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s10, v14 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v55 +; SI-NEXT: v_readfirstlane_b32 s11, v54 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s12, v14 @@ -195127,24 +194780,24 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 ; SI-NEXT: s_lshr_b32 s55, s11, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s12, v14 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s13, v14 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v55 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s14, v14 @@ -195154,26 +194807,28 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 ; SI-NEXT: s_lshr_b32 s65, s13, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s15, v14 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v40 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 ; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 ; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 @@ -195182,19 +194837,19 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 ; SI-NEXT: s_lshr_b32 s67, s15, 8 ; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readfirstlane_b32 s17, v44 +; SI-NEXT: v_readfirstlane_b32 s17, v7 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 @@ -195203,139 +194858,132 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 ; SI-NEXT: s_lshr_b32 s48, s17, 8 ; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readfirstlane_b32 s19, v54 +; SI-NEXT: v_readfirstlane_b32 s19, v8 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 ; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 +; SI-NEXT: s_lshr_b32 s50, s19, 8 ; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s21, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v42 ; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 ; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 ; SI-NEXT: s_lshr_b32 s52, s21, 8 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v26 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_readfirstlane_b32 s23, v32 +; SI-NEXT: v_readfirstlane_b32 s23, v10 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 ; SI-NEXT: s_or_b32 s23, s24, s23 ; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: v_bfe_u32 v2, v55, 8, 8 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s25, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: s_or_b32 s24, s25, s24 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v30 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: v_bfe_u32 v35, v30, 8, 8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s25, v31 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: v_bfe_u32 v23, v31, 8, 8 ; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: s_or_b32 s25, s26, s25 ; SI-NEXT: s_lshr_b32 s64, s25, 8 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s27, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 ; SI-NEXT: s_or_b32 s26, s27, s26 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 -; SI-NEXT: v_bfe_u32 v18, v44, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s27, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 +; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: v_bfe_u32 v37, v46, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v32, 8, 8 ; SI-NEXT: v_readfirstlane_b32 s28, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s27, s28, s27 ; SI-NEXT: s_lshr_b32 s66, s27, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s28, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: s_or_b32 s28, s29, s28 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s29, v56 +; SI-NEXT: v_readfirstlane_b32 s29, v30 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: v_bfe_u32 v38, v56, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v30, 8, 8 ; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s29, s40, s29 ; SI-NEXT: s_lshr_b32 s68, s29, 8 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -195343,62 +194991,60 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s41, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s40, s41, s40 -; SI-NEXT: v_readfirstlane_b32 s41, v57 +; SI-NEXT: v_readfirstlane_b32 s41, v11 +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_lshl_b32 s41, s41, 16 ; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s41, s42, s41 ; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s42, v14 ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s43, v14 ; SI-NEXT: s_or_b32 s42, s43, s42 -; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_readfirstlane_b32 s43, v57 ; SI-NEXT: s_lshl_b32 s43, s43, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s43, s44, s43 ; SI-NEXT: s_lshr_b32 s70, s43, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_bfe_u32 v47, v1, 8, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s45, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s44, s45, s44 ; SI-NEXT: v_readfirstlane_b32 s45, v13 ; SI-NEXT: s_lshl_b32 s45, s45, 16 @@ -195485,203 +195131,188 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 ; SI-NEXT: s_lshr_b32 s71, s45, 8 ; SI-NEXT: .LBB95_5: ; %end -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 -; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 +; SI-NEXT: v_readlane_b32 s47, v62, 0 +; SI-NEXT: s_lshl_b32 s47, s47, 8 ; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 ; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 -; SI-NEXT: s_lshl_b32 s57, vcc_lo, 24 +; SI-NEXT: v_readlane_b32 s47, v62, 2 +; SI-NEXT: s_and_b32 s47, s47, 0xff +; SI-NEXT: v_readlane_b32 s57, v62, 4 +; SI-NEXT: s_lshl_b32 s57, s57, 24 ; SI-NEXT: s_lshl_b32 s47, s47, 16 ; SI-NEXT: s_or_b32 s47, s57, s47 ; SI-NEXT: s_and_b32 s44, s44, 0xffff ; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v16, s44 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mov_b32_e32 v14, s44 ; SI-NEXT: s_and_b32 s44, s45, 0xff ; SI-NEXT: s_lshl_b32 s45, s71, 8 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v9 ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v18, v13 ; SI-NEXT: v_or_b32_e32 v13, s44, v13 ; SI-NEXT: v_readlane_b32 s44, v62, 6 -; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_lshl_b32 s44, s44, 8 ; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: s_or_b32 s42, s42, s44 ; SI-NEXT: v_readlane_b32 s44, v62, 8 -; SI-NEXT: v_readlane_b32 s45, v62, 9 ; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 -; SI-NEXT: s_lshl_b32 s45, vcc_lo, 24 +; SI-NEXT: v_readlane_b32 s45, v62, 10 +; SI-NEXT: s_lshl_b32 s45, s45, 24 ; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_or_b32 s44, s45, s44 ; SI-NEXT: s_and_b32 s42, s42, 0xffff ; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: v_mov_b32_e32 v18, s42 ; SI-NEXT: s_and_b32 s42, s43, 0xff ; SI-NEXT: s_lshl_b32 s43, s70, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v47 ; SI-NEXT: s_and_b32 s42, s42, 0xffff -; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v12, v25, v12 ; SI-NEXT: v_or_b32_e32 v12, s42, v12 ; SI-NEXT: v_readlane_b32 s42, v62, 12 -; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_lshl_b32 s42, s42, 8 ; SI-NEXT: s_and_b32 s40, s40, 0xff ; SI-NEXT: s_or_b32 s40, s40, s42 ; SI-NEXT: v_readlane_b32 s42, v62, 14 -; SI-NEXT: v_readlane_b32 s43, v62, 15 ; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 s44, v62, 16 -; SI-NEXT: s_lshl_b32 s43, s44, 24 +; SI-NEXT: v_readlane_b32 s43, v62, 16 +; SI-NEXT: s_lshl_b32 s43, s43, 24 ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_or_b32 s42, s43, s42 ; SI-NEXT: s_and_b32 s40, s40, 0xffff ; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_mov_b32_e32 v23, s40 +; SI-NEXT: v_mov_b32_e32 v25, s40 ; SI-NEXT: s_and_b32 s40, s41, 0xff ; SI-NEXT: s_lshl_b32 s41, s69, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_or_b32 s40, s40, s41 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v50 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v53 ; SI-NEXT: s_and_b32 s40, s40, 0xffff -; SI-NEXT: v_or_b32_e32 v11, v24, v11 +; SI-NEXT: v_or_b32_e32 v11, v26, v11 ; SI-NEXT: v_or_b32_e32 v11, s40, v11 ; SI-NEXT: v_readlane_b32 s40, v62, 18 -; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_lshl_b32 s40, s40, 8 ; SI-NEXT: s_and_b32 s28, s28, 0xff ; SI-NEXT: s_or_b32 s28, s28, s40 ; SI-NEXT: v_readlane_b32 s40, v62, 20 -; SI-NEXT: v_readlane_b32 s41, v62, 21 ; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: v_readlane_b32 s42, v62, 22 -; SI-NEXT: s_lshl_b32 s41, s42, 24 +; SI-NEXT: v_readlane_b32 s41, v62, 22 +; SI-NEXT: s_lshl_b32 s41, s41, 24 ; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_or_b32 s40, s41, s40 ; SI-NEXT: s_and_b32 s28, s28, 0xffff ; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s28 ; SI-NEXT: s_and_b32 s28, s29, 0xff ; SI-NEXT: s_lshl_b32 s29, s68, 8 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 ; SI-NEXT: s_or_b32 s28, s28, s29 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v34 ; SI-NEXT: s_and_b32 s28, s28, 0xffff -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v25, s28, v25 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v27, s28, v27 ; SI-NEXT: v_readlane_b32 s28, v62, 24 -; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_lshl_b32 s28, s28, 8 ; SI-NEXT: s_and_b32 s26, s26, 0xff ; SI-NEXT: s_or_b32 s26, s26, s28 ; SI-NEXT: v_readlane_b32 s28, v62, 26 -; SI-NEXT: v_readlane_b32 s29, v62, 27 ; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: v_readlane_b32 s40, v62, 28 -; SI-NEXT: s_lshl_b32 s29, s40, 24 +; SI-NEXT: v_readlane_b32 s29, v62, 28 +; SI-NEXT: s_lshl_b32 s29, s29, 24 ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_or_b32 s28, s29, s28 ; SI-NEXT: s_and_b32 s26, s26, 0xffff ; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_mov_b32_e32 v26, s26 +; SI-NEXT: v_mov_b32_e32 v28, s26 ; SI-NEXT: s_and_b32 s26, s27, 0xff ; SI-NEXT: s_lshl_b32 s27, s66, 8 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v32 ; SI-NEXT: s_or_b32 s26, s26, s27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 ; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_or_b32_e32 v27, s26, v27 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_or_b32_e32 v24, s26, v24 ; SI-NEXT: v_readlane_b32 s26, v62, 30 -; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_lshl_b32 s26, s26, 8 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_or_b32 s24, s24, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 32 -; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: v_readlane_b32 s28, v62, 34 -; SI-NEXT: s_lshl_b32 s27, s28, 24 +; SI-NEXT: v_readlane_b32 s27, v62, 34 +; SI-NEXT: s_lshl_b32 s27, s27, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 ; SI-NEXT: s_or_b32 s26, s27, s26 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v28, s24 +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v29, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xff ; SI-NEXT: s_lshl_b32 s25, s64, 8 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 -; SI-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v30, 0xff, v31 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v35 -; SI-NEXT: buffer_store_dword v21, v13, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: buffer_store_dword v18, v13, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v23, v23, v30 ; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v29, s24, v29 -; SI-NEXT: buffer_store_dword v23, v12, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v23, s24, v23 +; SI-NEXT: buffer_store_dword v25, v12, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 ; SI-NEXT: v_readlane_b32 s24, v62, 36 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v26, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: v_readlane_b32 s24, v62, 38 -; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 -; SI-NEXT: v_readlane_b32 s25, v62, 39 ; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: v_readlane_b32 s26, v62, 40 -; SI-NEXT: buffer_store_dword v26, v11, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s25, v62, 40 +; SI-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_lshl_b32 s25, s26, 24 -; SI-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s25, s25, 24 +; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v29, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v29, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v23, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 ; SI-NEXT: v_mov_b32_e32 v12, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xff ; SI-NEXT: s_lshl_b32 s23, s54, 8 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v36 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v10, s22, v10 @@ -195689,11 +195320,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshl_b32 s22, s62, 8 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: v_readlane_b32 s22, v62, 42 -; SI-NEXT: v_readlane_b32 s23, v62, 43 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: v_readlane_b32 s24, v62, 44 +; SI-NEXT: v_readlane_b32 s23, v62, 44 ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_lshl_b32 s23, s24, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 24 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 @@ -195704,11 +195334,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v11, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xff ; SI-NEXT: s_lshl_b32 s21, s52, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v42 ; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v21 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v9, s20, v9 @@ -195728,11 +195358,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v10, s18 ; SI-NEXT: s_and_b32 s18, s19, 0xff ; SI-NEXT: s_lshl_b32 s19, s50, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v37 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v8, s18, v8 @@ -195752,11 +195382,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v9, s16 ; SI-NEXT: s_and_b32 s16, s17, 0xff ; SI-NEXT: s_lshl_b32 s17, s48, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v19 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v7, s16, v7 @@ -195776,11 +195406,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v8, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff ; SI-NEXT: s_lshl_b32 s15, s67, 8 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v40 ; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v56 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v6, s14, v6 @@ -195800,11 +195430,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v7, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff ; SI-NEXT: s_lshl_b32 s13, s65, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v55 ; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v2 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v5, s12, v5 @@ -195824,7 +195454,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff ; SI-NEXT: s_lshl_b32 s11, s55, 8 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v54 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -195848,11 +195478,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s53, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 ; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v44 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_or_b32_e32 v2, s8, v2 @@ -195876,19 +195506,18 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v58 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s6, v1 ; SI-NEXT: v_readlane_b32 s6, v62, 46 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 47 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s80, 0xff -; SI-NEXT: v_readlane_b32 s8, v62, 48 +; SI-NEXT: v_readlane_b32 s7, v62, 48 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s8, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 @@ -195900,11 +195529,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s49, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v46 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -195925,13 +195554,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s30, v63, 34 -; SI-NEXT: v_readlane_b32 s45, v62, 17 -; SI-NEXT: v_readlane_b32 s43, v62, 23 -; SI-NEXT: v_readlane_b32 s41, v62, 29 -; SI-NEXT: v_readlane_b32 s29, v62, 35 -; SI-NEXT: v_readlane_b32 s27, v62, 41 -; SI-NEXT: v_readlane_b32 s25, v62, 45 -; SI-NEXT: v_readlane_b32 s9, v62, 49 ; SI-NEXT: v_readlane_b32 s31, v63, 35 ; SI-NEXT: v_readlane_b32 s99, v63, 33 ; SI-NEXT: v_readlane_b32 s98, v63, 32 @@ -195968,8 +195590,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_readlane_b32 s35, v63, 1 ; SI-NEXT: v_readlane_b32 s34, v63, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -197104,16 +196726,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 @@ -197255,16 +196875,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 @@ -197284,16 +196902,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 @@ -198225,20 +197841,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v55, v19 ; GFX9-NEXT: v_mov_b32_e32 v19, s81 ; GFX9-NEXT: .LBB95_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 @@ -198296,26 +197912,23 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_readlane_b32 s35, v63, 1 ; GFX9-NEXT: v_readlane_b32 s34, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v42, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -198328,9 +197941,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198347,9 +197959,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198364,11 +197975,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -198384,9 +197995,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198403,9 +198013,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198422,9 +198031,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198441,13 +198049,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 @@ -198456,13 +198063,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198489,9 +198095,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198514,9 +198119,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198539,9 +198143,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198564,9 +198167,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198589,9 +198191,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -198608,8 +198209,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -199315,10 +198915,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB95_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 @@ -214397,16 +213996,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 @@ -214426,16 +214023,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 @@ -214455,16 +214050,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 @@ -214484,16 +214077,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 @@ -214513,16 +214104,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 @@ -214547,8 +214136,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -214638,8 +214226,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -214660,16 +214247,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 @@ -214689,16 +214274,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 @@ -214719,16 +214302,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 @@ -214753,8 +214334,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -215533,9 +215113,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -215552,9 +215131,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -215596,12 +215174,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -215622,12 +215200,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -215648,12 +215226,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -215674,12 +215252,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -215700,12 +215278,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -215726,12 +215304,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -217831,12 +217409,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: .LBB99_3: ; %end ; SI-NEXT: s_lshl_b32 s17, s92, 8 ; SI-NEXT: s_and_b32 s18, s60, 0xff +; SI-NEXT: v_readlane_b32 s16, v41, 20 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 20 -; SI-NEXT: v_readlane_b32 s19, v41, 21 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 22 -; SI-NEXT: s_lshl_b32 s19, s20, 24 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v41, 22 +; SI-NEXT: s_lshl_b32 s19, s16, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s17, s17, 0xffff @@ -217854,18 +217431,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v2, s17 ; SI-NEXT: v_readlane_b32 s16, v41, 24 -; SI-NEXT: v_readlane_b32 s17, v41, 25 +; SI-NEXT: v_mov_b32_e32 v2, s17 ; SI-NEXT: s_lshl_b32 s17, s16, 8 ; SI-NEXT: s_and_b32 s18, s56, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 23 +; SI-NEXT: v_readlane_b32 s16, v41, 26 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 26 -; SI-NEXT: v_readlane_b32 s19, v41, 27 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 28 -; SI-NEXT: s_lshl_b32 s19, s20, 24 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v41, 28 +; SI-NEXT: s_lshl_b32 s19, s16, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s17, s17, 0xffff @@ -218001,13 +217575,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: s_lshl_b32 s17, s90, 8 ; SI-NEXT: s_and_b32 s18, s46, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 29 +; SI-NEXT: v_readlane_b32 s16, v41, 30 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 30 -; SI-NEXT: v_readlane_b32 s19, v41, 31 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 32 -; SI-NEXT: s_lshl_b32 s19, s20, 24 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v41, 32 +; SI-NEXT: s_lshl_b32 s19, s16, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s17, s17, 0xffff @@ -218020,84 +217592,79 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s16, 0xff ; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 34 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: v_readlane_b32 s17, v41, 35 -; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v41, 34 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v16, s17 ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 37 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 38 +; SI-NEXT: v_readlane_b32 s17, v41, 36 ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 38 ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s43, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 43 ; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 42 -; SI-NEXT: v_readlane_b32 s19, v41, 39 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v43, 43 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 42 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 40 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v41, 41 +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v41, 42 +; SI-NEXT: v_readlane_b32 s17, v41, 40 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 43 +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s17, v41, 42 +; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: v_readlane_b32 s18, v41, 44 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -218126,16 +217693,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 46 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v41, 47 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v41, 45 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 48 -; SI-NEXT: v_readlane_b32 s17, v41, 49 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 50 +; SI-NEXT: v_readlane_b32 s17, v41, 50 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 @@ -218161,15 +217725,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 52 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v41, 53 ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 54 -; SI-NEXT: v_readlane_b32 s15, v41, 55 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v41, 56 +; SI-NEXT: v_readlane_b32 s15, v41, 56 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 @@ -218195,15 +217757,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 58 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v41, 59 ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 60 -; SI-NEXT: v_readlane_b32 s13, v41, 61 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v41, 62 +; SI-NEXT: v_readlane_b32 s13, v41, 62 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s14, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 @@ -218229,15 +217789,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 0 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v43, 1 ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 2 -; SI-NEXT: v_readlane_b32 s11, v43, 3 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v43, 4 +; SI-NEXT: v_readlane_b32 s11, v43, 4 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 @@ -218263,15 +217821,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 6 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 7 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 9 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v43, 10 +; SI-NEXT: v_readlane_b32 s9, v43, 10 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 @@ -218297,15 +217853,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 12 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 ; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 14 -; SI-NEXT: v_readlane_b32 s7, v43, 15 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s8, v43, 16 +; SI-NEXT: v_readlane_b32 s7, v43, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s8, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 @@ -218330,13 +217884,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s30, v40, 34 -; SI-NEXT: v_readlane_b32 s21, v41, 33 -; SI-NEXT: v_readlane_b32 s19, v41, 51 -; SI-NEXT: v_readlane_b32 s17, v41, 57 -; SI-NEXT: v_readlane_b32 s15, v41, 63 -; SI-NEXT: v_readlane_b32 s13, v43, 5 -; SI-NEXT: v_readlane_b32 s11, v43, 11 -; SI-NEXT: v_readlane_b32 s9, v43, 17 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s31, v40, 35 ; SI-NEXT: v_readlane_b32 s99, v40, 33 @@ -220667,20 +220214,20 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v55, v19 ; GFX9-NEXT: v_mov_b32_e32 v19, s81 ; GFX9-NEXT: .LBB99_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 @@ -220738,26 +220285,23 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s35, v63, 1 ; GFX9-NEXT: v_readlane_b32 s34, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v42, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -220770,9 +220314,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220789,9 +220332,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220806,11 +220348,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -220826,9 +220368,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220845,9 +220386,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220864,9 +220404,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220883,13 +220422,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 @@ -220898,13 +220436,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220931,9 +220468,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220956,9 +220492,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -220981,9 +220516,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -221006,9 +220540,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -221031,9 +220564,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -221050,8 +220582,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -221757,10 +221288,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB99_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 @@ -236973,11 +236503,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 ; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -237003,11 +236533,12 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 @@ -237032,11 +236563,12 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -237095,11 +236627,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -237113,11 +236645,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -237131,11 +236663,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -237149,11 +236681,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -237216,10 +236748,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -243346,12 +242878,11 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -243363,24 +242894,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v54, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -243390,13 +242904,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v26, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -243406,10 +242915,25 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v22, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -243421,7 +242945,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -243433,6 +242963,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -243441,17 +242975,17 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60 ; SI-NEXT: v_or_b32_e32 v43, v3, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 @@ -243480,20 +243014,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshr_b64 v[62:63], v[38:39], 16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 ; SI-NEXT: v_or_b32_e32 v4, v3, v4 ; SI-NEXT: v_mov_b32_e32 v63, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 @@ -243738,11 +243265,11 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -243788,11 +243315,11 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -243804,11 +243331,11 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -243818,12 +243345,11 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 1a76b23df5bb5..9a1e98dd114a1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -25433,7 +25433,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s30, v63, 18 ; VI-NEXT: v_readlane_b32 s31, v63, 19 ; VI-NEXT: v_readlane_b32 s67, v63, 17 @@ -25454,7 +25453,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_readlane_b32 s36, v63, 2 ; VI-NEXT: v_readlane_b32 s35, v63, 1 ; VI-NEXT: v_readlane_b32 s34, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25470,8 +25469,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25866,7 +25864,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s30, v63, 14 ; GFX9-NEXT: v_readlane_b32 s31, v63, 15 ; GFX9-NEXT: v_readlane_b32 s55, v63, 13 @@ -25883,7 +25880,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s36, v63, 2 ; GFX9-NEXT: v_readlane_b32 s35, v63, 1 ; GFX9-NEXT: v_readlane_b32 s34, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25897,8 +25894,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -64743,10 +64739,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -64765,10 +64761,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -66036,14 +66032,12 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s79, s5, 8 ; SI-NEXT: v_writelane_b32 v21, s17, 13 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: v_readlane_b32 s18, v21, 0 -; SI-NEXT: v_readlane_b32 s19, v21, 1 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v21, 2 +; SI-NEXT: v_readlane_b32 s17, v21, 0 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v21, 3 +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s17, v21, 2 +; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: v_readlane_b32 s18, v21, 4 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -66063,16 +66057,13 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v21, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v21, 5 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 8 -; SI-NEXT: v_readlane_b32 s17, v21, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 10 +; SI-NEXT: v_readlane_b32 s17, v21, 10 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -66097,15 +66088,13 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 12 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s15, v21, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s10, s10, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 14 -; SI-NEXT: v_readlane_b32 s15, v21, 15 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v21, 16 +; SI-NEXT: v_readlane_b32 s15, v21, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 @@ -66257,8 +66246,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s30, v20, 34 -; SI-NEXT: v_readlane_b32 s19, v21, 11 -; SI-NEXT: v_readlane_b32 s17, v21, 17 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s31, v20, 35 ; SI-NEXT: v_readlane_b32 s99, v20, 33 @@ -67218,7 +67205,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s30, v63, 14 ; GFX9-NEXT: v_readlane_b32 s31, v63, 15 ; GFX9-NEXT: v_readlane_b32 s55, v63, 13 @@ -67235,7 +67221,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s36, v63, 2 ; GFX9-NEXT: v_readlane_b32 s35, v63, 1 ; GFX9-NEXT: v_readlane_b32 s34, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -67249,8 +67235,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -70706,11 +70691,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -70722,13 +70704,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_mov_b32_e32 v10, v38 ; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v48 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v18, v49 ; SI-NEXT: v_mov_b32_e32 v20, v35 ; SI-NEXT: v_mov_b32_e32 v22, v36 @@ -70736,6 +70715,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v26, v51 ; SI-NEXT: v_mov_b32_e32 v28, v54 ; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: v_mov_b32_e32 v39, v32 @@ -79282,7 +79262,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v3, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s30, v63, 18 @@ -79305,7 +79284,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s36, v63, 2 ; VI-NEXT: v_readlane_b32 s35, v63, 1 ; VI-NEXT: v_readlane_b32 s34, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79319,10 +79298,9 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 ; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79720,7 +79698,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s30, v63, 14 ; GFX9-NEXT: v_readlane_b32 s31, v63, 15 ; GFX9-NEXT: v_readlane_b32 s55, v63, 13 @@ -79737,7 +79714,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s36, v63, 2 ; GFX9-NEXT: v_readlane_b32 s35, v63, 1 ; GFX9-NEXT: v_readlane_b32 s34, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79751,8 +79728,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88765,7 +88741,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s30, v63, 18 @@ -88788,7 +88763,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_readlane_b32 s36, v63, 2 ; VI-NEXT: v_readlane_b32 s35, v63, 1 ; VI-NEXT: v_readlane_b32 s34, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88802,10 +88777,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 1ef8f2430428e..11489b835a3b9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -53826,13 +53826,12 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v38, v55 +; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -53859,13 +53858,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -53875,9 +53872,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v30, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 @@ -53886,17 +53885,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -53907,6 +53898,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -54235,12 +54235,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -54262,12 +54261,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir index b5c3e3214f125..2c809eba09987 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir @@ -42,7 +42,7 @@ body: | ; GCN8-NEXT: S_NOP 7 ; GCN8-NEXT: S_NOP 0 ; GCN8-NEXT: S_NOP 7 - ; GCN8-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + ; GCN8-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 ; GCN8-NEXT: S_NOP 7 ; GCN8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec ; GCN8-NEXT: S_NOP 7 @@ -84,7 +84,7 @@ body: | ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_NOP 0 ; GCN16-NEXT: S_NOP 15 - ; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + ; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec ; GCN16-NEXT: S_NOP 15 @@ -110,7 +110,7 @@ body: | liveins: $sgpr6, $sgpr10_sgpr11 SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 - renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec S_BRANCH %bb.3 bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir b/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir index 72b6b9f9ec686..c4d908b62af2d 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/bug-undef-spilled-agpr.mir @@ -93,7 +93,7 @@ body: | bb.3: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - $sgpr14 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + $sgpr14 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) ADJCALLSTACKDOWN 0, 28, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir index 5c7c07632f0d5..86716368b62eb 100644 --- a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir +++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir @@ -27,7 +27,7 @@ body: | ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 @@ -39,7 +39,7 @@ body: | ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) - ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 undef %1.sub0:vreg_128 = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir index 2f43c8264bf90..a1d87244e3df7 100644 --- a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir +++ b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir @@ -34,7 +34,7 @@ body: | ; GCN-NEXT: SI_RETURN SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 - renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec SI_RETURN @@ -99,7 +99,7 @@ body: | liveins: $sgpr6, $sgpr10_sgpr11 SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 - renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec S_BRANCH %bb.3 bb.3: @@ -186,7 +186,7 @@ body: | liveins: $sgpr4, $sgpr10_sgpr11 SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 - renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec S_BRANCH %bb.2 bb.2: @@ -261,7 +261,7 @@ body: | liveins: $sgpr4, $vgpr2_vgpr3 SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 - renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir b/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir index 01cd5f74dcb48..65d99313d26fa 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir +++ b/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir @@ -51,7 +51,7 @@ body: | ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY $vcc ; CHECK-NEXT: SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY $vcc - ; CHECK-NEXT: $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: $vcc = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, 3, killed $sgpr4_sgpr5, implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit killed $vgpr0, implicit killed renamable $vcc %0:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir index f9af0a03e51d0..99e2905d37368 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir @@ -55,7 +55,7 @@ body: | ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 8fcde89adf752..fdeb149edc4b1 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -3330,6 +3330,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v7 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 @@ -3337,9 +3338,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 ; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_mov_b32_e32 v10, v21 +; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 @@ -3364,10 +3364,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3 ; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 ; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 -; GFX11-NEXT: v_mov_b32_e32 v9, v20 ; GFX11-NEXT: scratch_store_b32 off, v11, s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v11, v22 +; GFX11-NEXT: v_dual_mov_b32 v9, v20 :: v_dual_mov_b32 v8, v19 +; GFX11-NEXT: v_mov_b32_e32 v29, v45 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x80 ; GFX11-NEXT: v_mov_b32_e32 v5, v16 @@ -3380,33 +3380,35 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 +; GFX11-NEXT: v_mov_b32_e32 v12, v23 ; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x50 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; GFX11-NEXT: v_mov_b32_e32 v14, v25 ; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 64 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 +; GFX11-NEXT: v_mov_b32_e32 v15, v26 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v14, v25 +; GFX11-NEXT: v_mov_b32_e32 v16, v27 ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v16, v27 +; GFX11-NEXT: v_mov_b32_e32 v30, v46 ; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2 ; GFX11-NEXT: s_add_i32 s2, s32, 16 -; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: v_mov_b32_e32 v31, v47 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload +; GFX11-NEXT: scratch_load_b96 v[0:2], off, s33 offset:1588 ; 12-byte Folded Reload +; GFX11-NEXT: s_add_i32 s2, s33, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v13, v24 +; GFX11-NEXT: v_mov_b32_e32 v11, v22 +; GFX11-NEXT: s_clause 0x2 ; 48-byte Folded Reload ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 -; GFX11-NEXT: s_add_i32 s2, s33, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v59, off, s33 diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index da1175c02e94a..61c908366c7f7 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -85,7 +85,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr65, 1, implicit-def dead $scc ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec @@ -128,7 +128,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr64, 1, implicit-def dead $scc - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 424aaaea11722..884024bebf865 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -109,25 +109,18 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s52, v7, 16 -; CHECK-NEXT: v_readlane_b32 s60, v7, 24 -; CHECK-NEXT: v_readlane_b32 s61, v7, 25 -; CHECK-NEXT: v_readlane_b32 s62, v7, 26 -; CHECK-NEXT: v_readlane_b32 s63, v7, 27 -; CHECK-NEXT: v_readlane_b32 s64, v7, 28 -; CHECK-NEXT: v_readlane_b32 s65, v7, 29 -; CHECK-NEXT: v_readlane_b32 s66, v7, 30 -; CHECK-NEXT: v_readlane_b32 s67, v7, 31 +; CHECK-NEXT: v_readlane_b32 s56, v7, 24 +; CHECK-NEXT: v_readlane_b32 s57, v7, 25 +; CHECK-NEXT: v_readlane_b32 s58, v7, 26 +; CHECK-NEXT: v_readlane_b32 s59, v7, 27 +; CHECK-NEXT: v_readlane_b32 s60, v7, 28 +; CHECK-NEXT: v_readlane_b32 s61, v7, 29 +; CHECK-NEXT: v_readlane_b32 s62, v7, 30 +; CHECK-NEXT: v_readlane_b32 s63, v7, 31 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s53, v7, 17 -; CHECK-NEXT: v_readlane_b32 s54, v7, 18 -; CHECK-NEXT: v_readlane_b32 s55, v7, 19 -; CHECK-NEXT: v_readlane_b32 s56, v7, 20 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: s_nop 3 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[56:63], s[68:71] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: v_readlane_b32 s57, v7, 21 -; CHECK-NEXT: v_readlane_b32 s58, v7, 22 -; CHECK-NEXT: v_readlane_b32 s59, v7, 23 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -151,48 +144,34 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v0, s16 -; CHECK-NEXT: v_readlane_b32 s44, v7, 16 -; CHECK-NEXT: v_mov_b32_e32 v1, s17 +; CHECK-NEXT: v_mov_b32_e32 v2, s16 +; CHECK-NEXT: v_readlane_b32 s56, v7, 16 +; CHECK-NEXT: v_mov_b32_e32 v3, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: v_readlane_b32 s45, v7, 17 -; CHECK-NEXT: v_readlane_b32 s46, v7, 18 -; CHECK-NEXT: v_readlane_b32 s47, v7, 19 -; CHECK-NEXT: v_readlane_b32 s48, v7, 20 -; CHECK-NEXT: v_readlane_b32 s49, v7, 21 -; CHECK-NEXT: v_readlane_b32 s50, v7, 22 -; CHECK-NEXT: v_readlane_b32 s51, v7, 23 -; CHECK-NEXT: v_readlane_b32 s52, v7, 24 -; CHECK-NEXT: v_readlane_b32 s53, v7, 25 -; CHECK-NEXT: v_readlane_b32 s54, v7, 26 -; CHECK-NEXT: v_readlane_b32 s55, v7, 27 -; CHECK-NEXT: v_readlane_b32 s56, v7, 28 -; CHECK-NEXT: v_readlane_b32 s57, v7, 29 -; CHECK-NEXT: v_readlane_b32 s58, v7, 30 -; CHECK-NEXT: v_readlane_b32 s59, v7, 31 -; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s44, v7, 0 -; CHECK-NEXT: v_readlane_b32 s52, v7, 8 -; CHECK-NEXT: v_readlane_b32 s53, v7, 9 -; CHECK-NEXT: v_readlane_b32 s54, v7, 10 -; CHECK-NEXT: v_readlane_b32 s55, v7, 11 -; CHECK-NEXT: v_readlane_b32 s56, v7, 12 -; CHECK-NEXT: v_readlane_b32 s57, v7, 13 -; CHECK-NEXT: v_readlane_b32 s58, v7, 14 -; CHECK-NEXT: v_readlane_b32 s59, v7, 15 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, v3 -; CHECK-NEXT: v_readlane_b32 s45, v7, 1 -; CHECK-NEXT: v_readlane_b32 s46, v7, 2 -; CHECK-NEXT: v_readlane_b32 s47, v7, 3 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s48, v7, 4 -; CHECK-NEXT: v_readlane_b32 s49, v7, 5 -; CHECK-NEXT: v_readlane_b32 s50, v7, 6 -; CHECK-NEXT: v_readlane_b32 s51, v7, 7 +; CHECK-NEXT: v_readlane_b32 s57, v7, 17 +; CHECK-NEXT: v_readlane_b32 s58, v7, 18 +; CHECK-NEXT: v_readlane_b32 s59, v7, 19 +; CHECK-NEXT: v_readlane_b32 s60, v7, 20 +; CHECK-NEXT: v_readlane_b32 s61, v7, 21 +; CHECK-NEXT: v_readlane_b32 s62, v7, 22 +; CHECK-NEXT: v_readlane_b32 s63, v7, 23 +; CHECK-NEXT: s_nop 4 +; CHECK-NEXT: image_sample_lz v1, v[2:3], s[56:63], s[16:19] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s56, v7, 8 +; CHECK-NEXT: v_readlane_b32 s57, v7, 9 +; CHECK-NEXT: v_readlane_b32 s58, v7, 10 +; CHECK-NEXT: v_readlane_b32 s59, v7, 11 +; CHECK-NEXT: v_readlane_b32 s60, v7, 12 +; CHECK-NEXT: v_readlane_b32 s61, v7, 13 +; CHECK-NEXT: v_readlane_b32 s62, v7, 14 +; CHECK-NEXT: v_readlane_b32 s63, v7, 15 +; CHECK-NEXT: s_nop 4 +; CHECK-NEXT: image_sample_lz v0, v[2:3], s[56:63], s[24:27] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 98658ded03903..929d1c62f4e8c 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -80,7 +80,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) { ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) ; GCN-NEXT: dead [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, [[COPY]](s32), implicit $exec @@ -95,11 +95,11 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) { ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 renamable $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]], killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir index 06c3da09eede9..e5ec0d85ff6ad 100644 --- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir +++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir @@ -1,3 +1,4 @@ +# XFAIL: * # RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -simplify-mir -start-before=greedy,2 -stress-regalloc=4 -stop-before=virtregrewriter,2 -o - -verify-regalloc %s 2> %t.err | FileCheck %s # RUN: FileCheck -check-prefix=ERR %s < %t.err diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir index 215200c770245..341821b479e00 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir @@ -58,7 +58,8 @@ body: | ; CHECK-NEXT: SI_SPILL_S512_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_2:%[0-9]+]].sub0:vreg_96 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S256_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = COPY killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF ; CHECK-NEXT: dead undef [[IMAGE_SAMPLE_LZ_V1_V2_3:%[0-9]+]].sub0:vreg_128 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], undef renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: S_BRANCH %bb.2 @@ -82,7 +83,7 @@ body: | ; CHECK-NEXT: bb.7: ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.6 diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir index b8818c5550ad4..e32ffbc0dd00d 100644 --- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir @@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: bb.6: ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.6, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 59f4a9d44bbdd..340d3ececd1e2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -7476,12 +7476,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:240 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index cb17f01853221..f6015975937d6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -7531,21 +7531,16 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) expcnt(1) +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) expcnt(1) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v53, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v55, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v56, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v18, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v18 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v20 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 diff --git a/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir b/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir index d406f2932dc96..ca2499cc11987 100644 --- a/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir +++ b/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir @@ -218,7 +218,7 @@ body: | ; GCN-NEXT: SI_SPILL_S512_SAVE [[S_LOAD_DWORDX16_IMM1]], %stack.12, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.12, align 4, addrspace 5) ; GCN-NEXT: [[S_LOAD_DWORDX8_IMM7:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[V_READFIRSTLANE_B32_]], 156, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1) ; GCN-NEXT: SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM7]], %stack.8, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.8, align 4, addrspace 5) - ; GCN-NEXT: [[SI_SPILL_S64_RESTORE:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.19, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.19, align 4, addrspace 5) + ; GCN-NEXT: [[SI_SPILL_S64_RESTORE:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.19, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.19, align 4, addrspace 5) ; GCN-NEXT: [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM [[SI_SPILL_S64_RESTORE]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1) ; GCN-NEXT: SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM3]], %stack.7, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.7, addrspace 5) ; GCN-NEXT: SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -226,8 +226,8 @@ body: | ; GCN-NEXT: dead [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY1]], 0, 0 :: ("amdgpu-noclobber" load (s32), addrspace 1) ; GCN-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GCN-NEXT: [[S_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_MOV_B64_1]], 0, 0 :: ("amdgpu-noclobber" load (s32), addrspace 1) - ; GCN-NEXT: [[SI_SPILL_S64_RESTORE1:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) - ; GCN-NEXT: undef [[COPY2:%[0-9]+]].sub1:sgpr_64 = COPY [[SI_SPILL_S64_RESTORE1]].sub1 + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.2, 1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: undef [[COPY2:%[0-9]+]].sub1:sgpr_64 = COPY [[SI_SPILL_S32_RESTORE]] ; GCN-NEXT: [[COPY2:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 ; GCN-NEXT: S_CBRANCH_SCC1 %bb.10, implicit undef $scc ; GCN-NEXT: S_BRANCH %bb.7 @@ -241,7 +241,7 @@ body: | ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GCN-NEXT: $vcc = COPY [[DEF3]] - ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.9, implicit $vcc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.9, implicit $vcc_lo ; GCN-NEXT: S_BRANCH %bb.8 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.8: @@ -253,21 +253,21 @@ body: | ; GCN-NEXT: successors: %bb.10(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = COPY [[S_MOV_B32_4]] - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.10: ; GCN-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORD_IMM2]], 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.17, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.17, addrspace 5) - ; GCN-NEXT: dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_]], [[SI_SPILL_S32_RESTORE]], implicit-def dead $scc - ; GCN-NEXT: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.15, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.15, addrspace 5) - ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE1]], 0, implicit-def $scc + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.17, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.17, addrspace 5) + ; GCN-NEXT: dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_]], [[SI_SPILL_S32_RESTORE1]], implicit-def dead $scc + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE2:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.15, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.15, addrspace 5) + ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE2]], 0, implicit-def $scc ; GCN-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF - ; GCN-NEXT: [[SI_SPILL_S64_RESTORE2:%[0-9]+]]:sreg_64_xexec = SI_SPILL_S64_RESTORE %stack.18, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.18, align 4, addrspace 5) - ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S64_RESTORE2]].sub1, 0, implicit-def $scc + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE3:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.18, 1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.18, align 4, addrspace 5) + ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE3]], 0, implicit-def $scc ; GCN-NEXT: dead [[DEF5:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF - ; GCN-NEXT: [[SI_SPILL_S256_RESTORE:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.20, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.20, align 4, addrspace 5) + ; GCN-NEXT: [[SI_SPILL_S256_RESTORE:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.20, 0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.20, align 4, addrspace 5) ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub0 { ; GCN-NEXT: internal [[COPY3]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub2 ; GCN-NEXT: internal [[COPY3]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub4 @@ -284,8 +284,8 @@ body: | ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_7:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub4, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_8:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub5, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_9:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub6, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.14, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.14, align 4, addrspace 5) - ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0_sub1_sub2:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0_sub1_sub2 + ; GCN-NEXT: [[SI_SPILL_S96_RESTORE:%[0-9]+]]:sgpr_96 = SI_SPILL_S96_RESTORE %stack.14, 0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.14, align 4, addrspace 5) + ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0_sub1_sub2:sgpr_128 = COPY [[SI_SPILL_S96_RESTORE]] ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_10:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub0, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_11:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub1, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_12:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub2, 0, implicit $mode, implicit $exec @@ -297,7 +297,7 @@ body: | ; GCN-NEXT: dead [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: dead [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: dead [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF11]], undef [[DEF11]], implicit-def dead $scc - ; GCN-NEXT: [[SI_SPILL_S256_RESTORE1:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.16, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.16, align 4, addrspace 5) + ; GCN-NEXT: [[SI_SPILL_S256_RESTORE1:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.16, 0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.16, align 4, addrspace 5) ; GCN-NEXT: undef [[COPY5:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub0 { ; GCN-NEXT: internal [[COPY5]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub2 ; GCN-NEXT: internal [[COPY5]].sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub5 @@ -307,23 +307,23 @@ body: | ; GCN-NEXT: dead [[S_AND_B32_3:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_8]], undef [[V_CMP_GT_F32_e64_9]], implicit-def dead $scc ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_14:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub2, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY5]].sub5, [[COPY5]].sub7, implicit-def dead $scc - ; GCN-NEXT: [[SI_SPILL_S256_RESTORE2:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.10, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.10, align 4, addrspace 5) - ; GCN-NEXT: undef [[COPY6:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16 + ; GCN-NEXT: [[SI_SPILL_S224_RESTORE:%[0-9]+]]:sgpr_224 = SI_SPILL_S224_RESTORE %stack.10, 0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.10, align 4, addrspace 5) + ; GCN-NEXT: undef [[COPY6:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S224_RESTORE]] ; GCN-NEXT: dead [[S_OR_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY6]].sub0, [[COPY6]].sub1, implicit-def dead $scc ; GCN-NEXT: dead [[S_OR_B32_4:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY6]].sub2, undef [[S_OR_B32_3]], implicit-def dead $scc - ; GCN-NEXT: [[SI_SPILL_S32_RESTORE2:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.9, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.9, addrspace 5) - ; GCN-NEXT: dead [[S_AND_B32_4:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_3]], [[SI_SPILL_S32_RESTORE2]], implicit-def dead $scc + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE4:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.9, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.9, addrspace 5) + ; GCN-NEXT: dead [[S_AND_B32_4:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_3]], [[SI_SPILL_S32_RESTORE4]], implicit-def dead $scc ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_15:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub3, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_16:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub4, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_17:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub5, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_18:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub6, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[SI_SPILL_S32_RESTORE3:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.11, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.11, addrspace 5) - ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_19:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE3]], 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[SI_SPILL_S256_RESTORE3:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.13, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.13, align 4, addrspace 5) - ; GCN-NEXT: undef [[COPY7:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub0 { - ; GCN-NEXT: internal [[COPY7]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub2 - ; GCN-NEXT: internal [[COPY7]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub4 - ; GCN-NEXT: internal [[COPY7]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub7 + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE5:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.11, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.11, addrspace 5) + ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_19:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE5]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[SI_SPILL_S256_RESTORE2:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.13, 0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.13, align 4, addrspace 5) + ; GCN-NEXT: undef [[COPY7:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub0 { + ; GCN-NEXT: internal [[COPY7]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub2 + ; GCN-NEXT: internal [[COPY7]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub4 + ; GCN-NEXT: internal [[COPY7]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub7 ; GCN-NEXT: } ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_20:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub0, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_21:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub2, 0, implicit $mode, implicit $exec @@ -348,7 +348,7 @@ body: | ; GCN-NEXT: dead [[S_OR_B32_5:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY8]].sub10, [[COPY8]].sub9, implicit-def dead $scc ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_27:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub13, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_28:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub14, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.12, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.12, align 4, addrspace 5) + ; GCN-NEXT: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.12, 0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.12, align 4, addrspace 5) ; GCN-NEXT: undef [[COPY9:%[0-9]+]].sub1:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub1 { ; GCN-NEXT: internal [[COPY9]].sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub5 ; GCN-NEXT: internal [[COPY9]].sub6:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub6 @@ -367,8 +367,8 @@ body: | ; GCN-NEXT: dead [[DEF15:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: dead [[S_AND_B32_7:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF15]], undef [[DEF14]], implicit-def dead $scc ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_33:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub12, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[SI_SPILL_S256_RESTORE4:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.6, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.6, align 4, addrspace 5) - ; GCN-NEXT: undef [[COPY10:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE4]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16 + ; GCN-NEXT: [[SI_SPILL_S224_RESTORE1:%[0-9]+]]:sgpr_224 = SI_SPILL_S224_RESTORE %stack.6, 0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.6, align 4, addrspace 5) + ; GCN-NEXT: undef [[COPY10:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S224_RESTORE1]] ; GCN-NEXT: dead [[S_OR_B32_6:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY10]].sub0, [[COPY9]].sub15, implicit-def dead $scc ; GCN-NEXT: dead [[DEF16:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_34:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub1, 0, implicit $mode, implicit $exec @@ -380,38 +380,38 @@ body: | ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_38:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub5, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_39:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub6, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[S_AND_B32_8:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF18]], undef [[DEF17]], implicit-def dead $scc - ; GCN-NEXT: [[SI_SPILL_S256_RESTORE5:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.4, align 4, addrspace 5) - ; GCN-NEXT: undef [[COPY11:%[0-9]+]].sub0_sub1_sub2_sub3_sub4_sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE5]].sub0_sub1_sub2_sub3_sub4_sub5 + ; GCN-NEXT: [[SI_SPILL_S192_RESTORE:%[0-9]+]]:sgpr_192 = SI_SPILL_S192_RESTORE %stack.4, 0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.4, align 4, addrspace 5) + ; GCN-NEXT: undef [[COPY11:%[0-9]+]].sub0_sub1_sub2_sub3_sub4_sub5:sgpr_256 = COPY [[SI_SPILL_S192_RESTORE]] ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_40:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub0, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_41:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub1, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[SI_SPILL_S32_RESTORE4:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5) - ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_42:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE4]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE6:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.3, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5) + ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_42:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE6]], 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_43:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub2, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_44:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub3, 0, implicit $mode, implicit $exec ; GCN-NEXT: dead [[S_OR_B32_7:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY11]].sub4, [[COPY11]].sub5, implicit-def dead $scc - ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE4]], 0, implicit-def $scc - ; GCN-NEXT: [[SI_SPILL_S32_RESTORE5:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.5, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.5, addrspace 5) - ; GCN-NEXT: dead [[S_AND_B32_9:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_7]], [[SI_SPILL_S32_RESTORE5]], implicit-def dead $scc + ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE6]], 0, implicit-def $scc + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE7:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.5, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.5, addrspace 5) + ; GCN-NEXT: dead [[S_AND_B32_9:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_7]], [[SI_SPILL_S32_RESTORE7]], implicit-def dead $scc ; GCN-NEXT: dead [[S_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY2]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1) - ; GCN-NEXT: [[SI_SPILL_S256_RESTORE6:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.8, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.8, align 4, addrspace 5) - ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S256_RESTORE6]].sub7, 0, implicit-def $scc - ; GCN-NEXT: [[SI_SPILL_S32_RESTORE6:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.7, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.7, addrspace 5) - ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_45:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE6]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE8:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.8, 7, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.8, align 4, addrspace 5) + ; GCN-NEXT: S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE8]], 0, implicit-def $scc + ; GCN-NEXT: [[SI_SPILL_S32_RESTORE9:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.7, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.7, addrspace 5) + ; GCN-NEXT: dead [[V_CMP_GT_F32_e64_45:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE9]], 0, implicit $mode, implicit $exec ; GCN-NEXT: [[DEF19:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: dead [[S_AND_B32_10:%[0-9]+]]:sreg_32 = S_AND_B32 [[DEF19]], undef [[S_LOAD_DWORD_IMM6]], implicit-def dead $scc ; GCN-NEXT: dead [[S_AND_B32_11:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_AND_B32_10]], [[S_MOV_B32_2]], implicit-def dead $scc ; GCN-NEXT: $vcc = COPY undef [[S_AND_B32_11]] - ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.12, implicit $vcc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.12, implicit $vcc_lo ; GCN-NEXT: S_BRANCH %bb.11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.11: ; GCN-NEXT: successors: %bb.12(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.12: - ; GCN-NEXT: [[SI_SPILL_S64_RESTORE3:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) - ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE3]], 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; GCN-NEXT: [[SI_SPILL_S64_RESTORE4:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE4]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; GCN-NEXT: [[SI_SPILL_S64_RESTORE1:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; GCN-NEXT: [[SI_SPILL_S64_RESTORE2:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE2]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 4a0bb6ceccd3f..ed6653470913e 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -41,102 +41,86 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr36_sgpr37 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr38_sgpr39 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr50_sgpr51 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec - ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr52_sgpr53 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr52_sgpr53, implicit-def dead $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr41 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr42 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr43 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr44 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr45 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr46 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr47 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr48 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr49 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr50 = COPY renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr51 = COPY killed renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 - ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr56 - ; CHECK-NEXT: renamable $sgpr53 = COPY killed renamable $sgpr76 - ; CHECK-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr52_sgpr53 - ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 - ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57 - ; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76 - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54 - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50 - ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 - ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr58 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr59 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr60 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr61 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr62 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr63 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr64 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr68 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr12_sgpr13 + ; CHECK-NEXT: renamable $sgpr56 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr57 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr58 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr59 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr60 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr61 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr62 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr63 = COPY renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr64 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec + ; CHECK-NEXT: renamable $sgpr65 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr66 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr69 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr70 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr71 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr72 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr73 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr74 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr75 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr76 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr77 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr78 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr79 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr80 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr81 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr82 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr84 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr86 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: renamable $sgpr9 = SI_SPILL_S32_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr87 = COPY killed renamable $sgpr9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} @@ -158,30 +142,29 @@ body: | ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr12 = COPY killed renamable $sgpr60 ; CHECK-NEXT: $sgpr13 = COPY killed renamable $sgpr62 - ; CHECK-NEXT: $sgpr14 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $sgpr14 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_noregs, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr38_sgpr39, undef renamable $sgpr54_sgpr55, implicit-def dead $scc ; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.7(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec + ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr48_sgpr49, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.8(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec @@ -189,19 +172,19 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.1, 16, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr12_sgpr13, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec ; CHECK-NEXT: dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec ; CHECK-NEXT: renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 @@ -217,13 +200,11 @@ body: | ; CHECK-NEXT: renamable $sgpr33 = COPY killed renamable $sgpr16 ; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15 ; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr14 - ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 - ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83 ; CHECK-NEXT: renamable $sgpr16 = COPY killed renamable $sgpr33 @@ -238,44 +219,42 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.10: ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.11: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.12: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.13: ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.14 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.14: ; CHECK-NEXT: successors: %bb.15(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.15: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.16: diff --git a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir index c7fa879187cc3..dfc5eea3ff9d4 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir @@ -17,7 +17,7 @@ body: | ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 2, implicit $m0 ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 3, implicit $m0 ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr2, implicit killed $sgpr1 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 $m0 = IMPLICIT_DEF @@ -40,7 +40,7 @@ body: | ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 2, implicit $m0 ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr2, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 3, implicit $m0 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit killed $sgpr0, implicit killed $sgpr2 ; GCN-NEXT: renamable $sgpr0 = S_MUL_I32 renamable $sgpr5, 3 ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 killed $sgpr0, implicit $exec @@ -67,7 +67,7 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 3, implicit $m0 ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 2, implicit $m0 ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit killed $sgpr0_sgpr1, implicit killed $sgpr4_sgpr5 - ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: dead [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr4_sgpr5, implicit $exec ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr8, implicit renamable $sgpr11 %0:sreg_64 = IMPLICIT_DEF @@ -92,7 +92,7 @@ body: | ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 2, implicit $m0 ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64 3, implicit $m0 ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr4_sgpr5, implicit killed $sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: dead [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr2_sgpr3, implicit $exec ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 $m0 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir index 95eac12a65389..e0c0bb95e08f3 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir @@ -268,10 +268,10 @@ body: | ; GCN-NEXT: renamable $sgpr1 = S_GET_WAVEID_IN_WORKGROUP ; GCN-NEXT: renamable $sgpr0 = S_GET_WAVEID_IN_WORKGROUP ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -358,10 +358,10 @@ body: | ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM_ci renamable $sgpr2_sgpr3, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM_ci renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -475,10 +475,10 @@ body: | ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0 :: (dereferenceable load (s32), addrspace 4) ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable load (s32), addrspace 4) ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -507,11 +507,11 @@ body: | ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MEMTIME ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr0_sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1 - ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1 - ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 %1:sreg_64_xexec = S_MEMTIME @@ -539,11 +539,11 @@ body: | ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MEMREALTIME ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr0_sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1 - ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1 - ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 %1:sreg_64_xexec = S_MEMREALTIME @@ -570,9 +570,9 @@ body: | ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4) - ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1 undef %0.sub0_sub1:sgpr_256 = COPY $sgpr8_sgpr9 diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir index cb652db425421..005e5ad234bcc 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir @@ -36,10 +36,10 @@ body: | ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 2, implicit $exec ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 3, implicit $exec ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0 $exec = IMPLICIT_DEF @@ -99,9 +99,9 @@ body: | ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0 ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0 %0:sreg_32 = IMPLICIT_DEF @@ -618,23 +618,23 @@ body: | ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.5, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.5, addrspace 5) ; GCN-NEXT: renamable $sgpr0 = COPY killed renamable $sgpr1 ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.4, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.4, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.3, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.3, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.3, addrspace 5) ; GCN-NEXT: dead renamable $sgpr0 = S_ADD_U32 killed renamable $sgpr1, killed renamable $sgpr0, implicit-def $scc - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.2, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5) ; GCN-NEXT: dead renamable $sgpr0 = S_ADDC_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc, implicit $scc - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.3, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.5, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.5, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.3, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.3, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.5, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.5, addrspace 5) ; GCN-NEXT: dead renamable $sgpr0 = S_ADD_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5) ; GCN-NEXT: dead renamable $sgpr0 = S_ADDC_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc, implicit $scc - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.5, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.5, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.5, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.5, addrspace 5) ; GCN-NEXT: dead renamable $sgpr0 = S_ADD_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.2, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, 0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5) ; GCN-NEXT: dead renamable $sgpr0 = S_ADDC_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc, implicit $scc ; GCN-NEXT: S_ENDPGM 0 %0:sreg_64 = S_GETPC_B64_pseudo diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir index 8b87f5be52411..c28c2d61c04b9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -73,5 +73,5 @@ body: | DBG_VALUE %stack.0, 0, !1, !8, debug-location !9 bb.1: - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir index cdf2b41c1e5b4..4f80bb8e97544 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir @@ -49,5 +49,5 @@ body: | DBG_VALUE_LIST !1, !8, %stack.0, 0, debug-location !9 bb.1: - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir index 6238c46edd6d1..c31652a2a1373 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir @@ -53,5 +53,5 @@ body: | DBG_VALUE %fixed-stack.0, 0, !1, !8, debug-location !9 bb.1: - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 2107c7bd527fc..e8de673c14876 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -339,7 +339,7 @@ body: | KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 bb.2: - renamable $sgpr22 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + renamable $sgpr22 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) bb.3: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir index bb7a28c68d3c5..6f7077e90c86a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber.mir @@ -150,7 +150,7 @@ body: | bb.0: liveins: S_CMP_EQ_U32 0, 0, implicit-def $scc - $sgpr8 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, align 4, addrspace 5) + $sgpr8 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, align 4, addrspace 5) S_CBRANCH_SCC1 %bb.2, implicit $scc bb.1: @@ -203,7 +203,7 @@ body: | bb.0: liveins: S_CMP_EQ_U32 0, 0, implicit-def $scc - $sgpr8_sgpr9 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) + $sgpr8_sgpr9 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) S_CBRANCH_SCC1 %bb.2, implicit $scc bb.1: @@ -1974,7 +1974,7 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 S_CMP_EQ_U32 0, 0, implicit-def $scc - $sgpr8 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32:: (load (s32) from %stack.1, align 4, addrspace 5) + $sgpr8 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32:: (load (s32) from %stack.1, align 4, addrspace 5) S_CBRANCH_SCC1 %bb.2, implicit $scc bb.1: @@ -3749,7 +3749,7 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 S_CMP_EQ_U32 0, 0, implicit-def $scc - $sgpr8_sgpr9 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32:: (load (s64) from %stack.1, align 4, addrspace 5) + $sgpr8_sgpr9 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32:: (load (s64) from %stack.1, align 4, addrspace 5) S_CBRANCH_SCC1 %bb.2, implicit $scc bb.1: @@ -5546,8 +5546,8 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 S_CMP_EQ_U32 0, 0, implicit-def $scc - $sgpr8 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32:: (load (s32) from %stack.1, align 4, addrspace 5) - $sgpr9 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32:: (load (s32) from %stack.2, align 4, addrspace 5) + $sgpr8 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32:: (load (s32) from %stack.1, align 4, addrspace 5) + $sgpr9 = SI_SPILL_S32_RESTORE %stack.2, 0, implicit $exec, implicit $sgpr32:: (load (s32) from %stack.2, align 4, addrspace 5) S_CBRANCH_SCC1 %bb.2, implicit $scc bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir index a1fc683679f9d..0cfd9b5fc07dc 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir @@ -46,6 +46,6 @@ body: | ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $scc S_CMP_EQ_U32 0, 0, implicit-def $scc SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_SETPC_B64 $sgpr30_sgpr31, implicit $scc ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index 6202535561d46..166289b20e7c2 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -39,13 +39,13 @@ # SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5) # SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) # SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5) -# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) # SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0 -# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5) +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5) # SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) -# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) # SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0 -# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5) +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5) # NOSHARE: stack: # NOSHARE: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, @@ -64,14 +64,14 @@ # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.2, addrspace 5) # NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) # NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5) -# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) # NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit undef $vgpr0 -# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5) +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.2, addrspace 5) # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5) # NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) -# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5) # NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu, implicit $vgpr0 -# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5) +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5) ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir index 92c4249b26069..a1b3aedcf1006 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir @@ -1035,21 +1035,21 @@ body: | ; GCN64-FLATSCR-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0 ; GCN64-FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5) ; GCN64-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0 - renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr12_sgpr13_sgpr14 = SI_SPILL_S96_RESTORE %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12_sgpr13_sgpr14 = SI_SPILL_S96_RESTORE %stack.2, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr12_sgpr13_sgpr14_sgpr15 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12_sgpr13_sgpr14_sgpr15 = SI_SPILL_S128_RESTORE %stack.3, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = SI_SPILL_S160_RESTORE %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = SI_SPILL_S160_RESTORE %stack.4, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S256_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S256_RESTORE %stack.5, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.6, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.7, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.7, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.8, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.8, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir index bed7c0c12b7cb..c18f94576cc6a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-empty-prolog-block.mir @@ -25,7 +25,7 @@ body: | bb.0: liveins: $sgpr0 bb.1: - renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 + renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b5474b8974b29..ef193edb8636a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10150,41 +10150,35 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: .LBB1_2: ; %ret ; GFX6-NEXT: s_or_b64 exec, exec, vcc -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 exec, 3 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80400 +; GFX6-NEXT: s_mov_b32 s2, 0x80400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 +; GFX6-NEXT: v_readlane_b32 s36, v4, 0 +; GFX6-NEXT: v_readlane_b32 s37, v4, 1 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1] -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 exec, 3 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80800 +; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 +; GFX6-NEXT: v_readlane_b32 s38, v4, 2 +; GFX6-NEXT: v_readlane_b32 s39, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] +; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b32 s0, 0x86a00 ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX6-NEXT: s_mov_b32 s0, 0x86600 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir index b0be5676e26a2..28474ee03573b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -28,7 +28,7 @@ body: | ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 S_NOP 0 SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_SETPC_B64 $sgpr30_sgpr31 ... @@ -161,8 +161,8 @@ body: | SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_NOP 0 - renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_SETPC_B64 $sgpr30_sgpr31 ... @@ -230,7 +230,7 @@ body: | S_BRANCH %bb.3 bb.3: liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 ... @@ -297,12 +297,12 @@ body: | S_BRANCH %bb.3 bb.1: liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc S_BRANCH %bb.2 bb.2: liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc S_BRANCH %bb.3 bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir index 3531b3dd75792..ee41854399188 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir @@ -153,5 +153,5 @@ body: | $vcc = IMPLICIT_DEF SI_SPILL_S64_SAVE killed $vcc, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + $vcc = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 ... diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir index be2a31d7ccbaa..ae4f21b5ce30c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -30,7 +30,7 @@ body: | ; SPILLED-NEXT: S_NOP 1 ; SPILLED-NEXT: {{ $}} ; SPILLED-NEXT: bb.2: - ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 = SI_SPILL_S192_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s192) from %stack.0, align 4, addrspace 5) + ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 = SI_SPILL_S192_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s192) from %stack.0, align 4, addrspace 5) ; SPILLED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; ; EXPANDED-LABEL: name: spill_restore_sgpr192 diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir index f4965dcf61e17..2c7a7263a5272 100644 --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -26,7 +26,7 @@ body: | ; SPILLED-NEXT: S_NOP 1 ; SPILLED-NEXT: {{ $}} ; SPILLED-NEXT: bb.2: - ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 = SI_SPILL_S224_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s224) from %stack.0, align 4, addrspace 5) + ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 = SI_SPILL_S224_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s224) from %stack.0, align 4, addrspace 5) ; SPILLED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; ; EXPANDED-LABEL: name: spill_restore_sgpr224 diff --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir index 312531ba5bc99..50442f65a52ed 100644 --- a/llvm/test/CodeGen/AMDGPU/spill288.mir +++ b/llvm/test/CodeGen/AMDGPU/spill288.mir @@ -26,7 +26,7 @@ body: | ; SPILLED-NEXT: S_NOP 1 ; SPILLED-NEXT: {{ $}} ; SPILLED-NEXT: bb.2: - ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = SI_SPILL_S288_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s288) from %stack.0, align 4, addrspace 5) + ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = SI_SPILL_S288_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s288) from %stack.0, align 4, addrspace 5) ; SPILLED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; ; EXPANDED-LABEL: name: spill_restore_sgpr288 diff --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir index 0c0c01760f8ba..8d7098d71439d 100644 --- a/llvm/test/CodeGen/AMDGPU/spill320.mir +++ b/llvm/test/CodeGen/AMDGPU/spill320.mir @@ -26,7 +26,7 @@ body: | ; SPILLED-NEXT: S_NOP 1 ; SPILLED-NEXT: {{ $}} ; SPILLED-NEXT: bb.2: - ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = SI_SPILL_S320_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s320) from %stack.0, align 4, addrspace 5) + ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = SI_SPILL_S320_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s320) from %stack.0, align 4, addrspace 5) ; SPILLED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; ; EXPANDED-LABEL: name: spill_restore_sgpr320 diff --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir index 8823ba1a8326e..f4052a07ea7e1 100644 --- a/llvm/test/CodeGen/AMDGPU/spill352.mir +++ b/llvm/test/CodeGen/AMDGPU/spill352.mir @@ -26,7 +26,7 @@ body: | ; SPILLED-NEXT: S_NOP 1 ; SPILLED-NEXT: {{ $}} ; SPILLED-NEXT: bb.2: - ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 = SI_SPILL_S352_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s352) from %stack.0, align 4, addrspace 5) + ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 = SI_SPILL_S352_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s352) from %stack.0, align 4, addrspace 5) ; SPILLED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; ; EXPANDED-LABEL: name: spill_restore_sgpr352 diff --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir index e33641cf89237..c4dfa4ec618c3 100644 --- a/llvm/test/CodeGen/AMDGPU/spill384.mir +++ b/llvm/test/CodeGen/AMDGPU/spill384.mir @@ -26,7 +26,7 @@ body: | ; SPILLED-NEXT: S_NOP 1 ; SPILLED-NEXT: {{ $}} ; SPILLED-NEXT: bb.2: - ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = SI_SPILL_S384_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s384) from %stack.0, align 4, addrspace 5) + ; SPILLED-NEXT: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = SI_SPILL_S384_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s384) from %stack.0, align 4, addrspace 5) ; SPILLED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; ; EXPANDED-LABEL: name: spill_restore_sgpr384 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir index 359152e9d2b45..f86c20af2dffd 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -24,8 +24,8 @@ body: | ; RA-NEXT: bb.1: ; RA-NEXT: successors: %bb.2(0x80000000) ; RA-NEXT: {{ $}} - ; RA-NEXT: [[SI_SPILL_S1024_RESTORE:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) - ; RA-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE]].sub0_sub1 + ; RA-NEXT: [[SI_SPILL_S64_RESTORE:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; RA-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:sgpr_1024 = COPY [[SI_SPILL_S64_RESTORE]] ; RA-NEXT: [[COPY:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY]].sub0 ; RA-NEXT: [[COPY:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY]].sub1 ; RA-NEXT: [[COPY:%[0-9]+]].sub4:sgpr_1024 = COPY [[COPY]].sub0 @@ -55,8 +55,8 @@ body: | ; RA-NEXT: [[COPY:%[0-9]+]].sub28:sgpr_1024 = COPY [[COPY]].sub0 ; RA-NEXT: [[COPY:%[0-9]+]].sub29:sgpr_1024 = COPY [[COPY]].sub1 ; RA-NEXT: SI_SPILL_S1024_SAVE [[COPY]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) - ; RA-NEXT: [[SI_SPILL_S1024_RESTORE1:%[0-9]+]]:sgpr_1024 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; RA-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_1024 = COPY [[SI_SPILL_S1024_RESTORE1]].sub0 + ; RA-NEXT: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; RA-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_1024 = COPY [[SI_SPILL_S32_RESTORE]] ; RA-NEXT: [[COPY1:%[0-9]+]].sub1:sgpr_1024 = COPY [[COPY1]].sub0 ; RA-NEXT: [[COPY1:%[0-9]+]].sub2:sgpr_1024 = COPY [[COPY1]].sub0 ; RA-NEXT: [[COPY1:%[0-9]+]].sub3:sgpr_1024 = COPY [[COPY1]].sub0 @@ -113,7 +113,8 @@ body: | ; VR-NEXT: successors: %bb.2(0x80000000) ; VR-NEXT: liveins: $sgpr30_sgpr31, $sgpr34_sgpr35 ; VR-NEXT: {{ $}} - ; VR-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr36_sgpr37 = COPY killed renamable $sgpr4_sgpr5 ; VR-NEXT: renamable $sgpr38 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr39 = COPY renamable $sgpr37 ; VR-NEXT: renamable $sgpr40 = COPY renamable $sgpr36 @@ -143,7 +144,8 @@ body: | ; VR-NEXT: renamable $sgpr64 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr65 = COPY renamable $sgpr37 ; VR-NEXT: SI_SPILL_S1024_SAVE killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.0, align 4, addrspace 5) - ; VR-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.1, 0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr36 = COPY killed renamable $sgpr4 ; VR-NEXT: renamable $sgpr37 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr38 = COPY renamable $sgpr36 ; VR-NEXT: renamable $sgpr39 = COPY renamable $sgpr36 @@ -295,7 +297,7 @@ body: | ; RA-NEXT: } ; RA-NEXT: SI_SPILL_S512_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) ; RA-NEXT: S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98 - ; RA-NEXT: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) + ; RA-NEXT: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) ; RA-NEXT: undef [[COPY2:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub4_sub5 { ; RA-NEXT: internal [[COPY2]].sub10_sub11:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10_sub11 ; RA-NEXT: internal [[COPY2]].sub7:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub7 @@ -332,7 +334,7 @@ body: | ; VR-NEXT: renamable $sgpr26 = S_MOV_B32 -1 ; VR-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) ; VR-NEXT: S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98 - ; VR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) + ; VR-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) ; VR-NEXT: renamable $sgpr12_sgpr13 = COPY killed renamable $sgpr16_sgpr17 ; VR-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr19 ; VR-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr22_sgpr23 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir index 42db92b15acf5..d35a5178d4b20 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir @@ -312,9 +312,9 @@ body: | ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY67]], [[S_MOV_B32_]], 0, 224, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY68:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 { - ; CHECK-NEXT: internal [[COPY68]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY68:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE]].sub0 { + ; CHECK-NEXT: internal [[COPY68]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY69:%[0-9]+]].sub0:vreg_128 = COPY [[COPY68]].sub0 { ; CHECK-NEXT: internal [[COPY69]].sub2:vreg_128 = COPY [[COPY68]].sub2 @@ -331,9 +331,9 @@ body: | ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY71]], [[S_MOV_B32_]], 0, 192, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY72:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 { - ; CHECK-NEXT: internal [[COPY72]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE1:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY72:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE1]].sub0 { + ; CHECK-NEXT: internal [[COPY72]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE1]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY73:%[0-9]+]].sub0:vreg_128 = COPY [[COPY72]].sub0 { ; CHECK-NEXT: internal [[COPY73]].sub2:vreg_128 = COPY [[COPY72]].sub2 @@ -341,9 +341,9 @@ body: | ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY73]], [[S_MOV_B32_]], 0, 208, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY74:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 { - ; CHECK-NEXT: internal [[COPY74]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE2:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY74:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE2]].sub0 { + ; CHECK-NEXT: internal [[COPY74]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE2]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY75:%[0-9]+]].sub0:vreg_128 = COPY [[COPY74]].sub0 { ; CHECK-NEXT: internal [[COPY75]].sub2:vreg_128 = COPY [[COPY74]].sub2 @@ -351,9 +351,9 @@ body: | ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY75]], [[S_MOV_B32_]], 0, 160, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY76:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 { - ; CHECK-NEXT: internal [[COPY76]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE3:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY76:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE3]].sub0 { + ; CHECK-NEXT: internal [[COPY76]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE3]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY77:%[0-9]+]].sub0:vreg_128 = COPY [[COPY76]].sub0 { ; CHECK-NEXT: internal [[COPY77]].sub2:vreg_128 = COPY [[COPY76]].sub2 @@ -379,9 +379,9 @@ body: | ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY81]], [[S_MOV_B32_]], 0, 144, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY82:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 { - ; CHECK-NEXT: internal [[COPY82]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE4:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY82:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE4]].sub0 { + ; CHECK-NEXT: internal [[COPY82]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE4]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY83:%[0-9]+]].sub0:vreg_128 = COPY [[COPY82]].sub0 { ; CHECK-NEXT: internal [[COPY83]].sub2:vreg_128 = COPY [[COPY82]].sub2 @@ -389,9 +389,9 @@ body: | ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY83]], [[S_MOV_B32_]], 0, 96, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY84:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 { - ; CHECK-NEXT: internal [[COPY84]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE5:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY84:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE5]].sub0 { + ; CHECK-NEXT: internal [[COPY84]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE5]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY85:%[0-9]+]].sub0:vreg_128 = COPY [[COPY84]].sub0 { ; CHECK-NEXT: internal [[COPY85]].sub2:vreg_128 = COPY [[COPY84]].sub2 @@ -399,9 +399,9 @@ body: | ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY85]], [[S_MOV_B32_]], 0, 112, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY86:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 { - ; CHECK-NEXT: internal [[COPY86]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE6:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY86:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE6]].sub0 { + ; CHECK-NEXT: internal [[COPY86]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE6]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY87:%[0-9]+]].sub0:vreg_128 = COPY [[COPY86]].sub0 { ; CHECK-NEXT: internal [[COPY87]].sub2:vreg_128 = COPY [[COPY86]].sub2 @@ -409,9 +409,9 @@ body: | ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY87]], [[S_MOV_B32_]], 0, 64, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY88:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 { - ; CHECK-NEXT: internal [[COPY88]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE7:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY88:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE7]].sub0 { + ; CHECK-NEXT: internal [[COPY88]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE7]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY89:%[0-9]+]].sub0:vreg_128 = COPY [[COPY88]].sub0 { ; CHECK-NEXT: internal [[COPY89]].sub2:vreg_128 = COPY [[COPY88]].sub2 @@ -419,9 +419,9 @@ body: | ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY89]], [[S_MOV_B32_]], 0, 80, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY90:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 { - ; CHECK-NEXT: internal [[COPY90]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE8:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY90:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE8]].sub0 { + ; CHECK-NEXT: internal [[COPY90]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE8]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY91:%[0-9]+]].sub0:vreg_128 = COPY [[COPY90]].sub0 { ; CHECK-NEXT: internal [[COPY91]].sub2:vreg_128 = COPY [[COPY90]].sub2 @@ -429,9 +429,9 @@ body: | ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY91]], [[S_MOV_B32_]], 0, 32, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY92:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 { - ; CHECK-NEXT: internal [[COPY92]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE9:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY92:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE9]].sub0 { + ; CHECK-NEXT: internal [[COPY92]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE9]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY93:%[0-9]+]].sub0:vreg_128 = COPY [[COPY92]].sub0 { ; CHECK-NEXT: internal [[COPY93]].sub2:vreg_128 = COPY [[COPY92]].sub2 @@ -439,9 +439,9 @@ body: | ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY93]], [[S_MOV_B32_]], 0, 48, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY94:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 { - ; CHECK-NEXT: internal [[COPY94]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE10:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY94:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE10]].sub0 { + ; CHECK-NEXT: internal [[COPY94]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE10]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY95:%[0-9]+]].sub0:vreg_128 = COPY [[COPY94]].sub0 { ; CHECK-NEXT: internal [[COPY95]].sub2:vreg_128 = COPY [[COPY94]].sub2 @@ -449,9 +449,9 @@ body: | ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY95]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1) - ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: undef [[COPY96:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 { - ; CHECK-NEXT: internal [[COPY96]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2 + ; CHECK-NEXT: [[SI_SPILL_V96_RESTORE11:%[0-9]+]]:vreg_96 = SI_SPILL_V96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY96:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V96_RESTORE11]].sub0 { + ; CHECK-NEXT: internal [[COPY96]].sub2:vreg_128 = COPY [[SI_SPILL_V96_RESTORE11]].sub2 ; CHECK-NEXT: } ; CHECK-NEXT: undef [[COPY97:%[0-9]+]].sub0:vreg_128 = COPY [[COPY96]].sub0 { ; CHECK-NEXT: internal [[COPY97]].sub2:vreg_128 = COPY [[COPY96]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir b/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir index f50f2e98d4bfa..5292b528f3db4 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir @@ -27,8 +27,8 @@ body: | ; CHECK-NEXT: renamable $sgpr1 = IMPLICIT_DEF ; CHECK-NEXT: SI_SPILL_S64_SAVE renamable $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 - ; CHECK-NEXT: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: $sgpr105 = S_AND_B32 killed renamable $sgpr1, renamable $sgpr1, implicit-def $scc + ; CHECK-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, 1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: $sgpr105 = S_AND_B32 killed renamable $sgpr0, renamable $sgpr0, implicit-def $scc ; CHECK-NEXT: S_NOP 0, implicit $sgpr104, implicit $sgpr105 %0:sreg_64 = COPY $sgpr0_sgpr1 %0.sub0:sreg_64 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/splitkit.mir b/llvm/test/CodeGen/AMDGPU/splitkit.mir index 3065fce538157..5e0fee65ca258 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit.mir @@ -18,7 +18,7 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr3 ; CHECK-NEXT: SI_SPILL_S128_SAVE renamable $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit-def dead $sgpr0, implicit-def dead $sgpr1, implicit-def dead $sgpr2, implicit-def dead $sgpr3, implicit-def dead $sgpr4, implicit-def dead $sgpr5, implicit-def dead $sgpr6, implicit-def dead $sgpr7, implicit-def dead $sgpr8, implicit-def dead $sgpr9, implicit-def dead $sgpr10, implicit-def dead $sgpr11 - ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit renamable $sgpr0 ; CHECK-NEXT: S_NOP 0, implicit renamable $sgpr3 ; CHECK-NEXT: S_NOP 0, implicit renamable $sgpr0 @@ -90,7 +90,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit renamable $sgpr0 ; CHECK-NEXT: S_NOP 0, implicit-def dead $sgpr0, implicit-def dead $sgpr1, implicit-def dead $sgpr2, implicit-def dead $sgpr3, implicit-def dead $sgpr4, implicit-def dead $sgpr5, implicit-def dead $sgpr6, implicit-def dead $sgpr7, implicit-def dead $sgpr8, implicit-def dead $sgpr9, implicit-def dead $sgpr10, implicit-def dead $sgpr11 - ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} @@ -99,7 +99,7 @@ body: | ; CHECK-NEXT: liveins: $sgpr0_sgpr1_sgpr2_sgpr3:0x00000000000000C3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def dead $sgpr0, implicit-def dead $sgpr1, implicit-def dead $sgpr2, implicit-def dead $sgpr3, implicit-def dead $sgpr4, implicit-def dead $sgpr5, implicit-def dead $sgpr6, implicit-def dead $sgpr7, implicit-def dead $sgpr8, implicit-def dead $sgpr9, implicit-def dead $sgpr10, implicit-def dead $sgpr11 - ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.0, 0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir index a7a241e3faea8..407e1a9db97ef 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -10,7 +10,7 @@ # CHECK-NEXT: stack-id: sgpr-spill, # CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) -# CHECK: renamable $sgpr5 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) +# CHECK: renamable $sgpr5 = SI_SPILL_S32_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir index bc8a383a285b2..099d100142aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir +++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir @@ -34,20 +34,16 @@ body: | ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_AV32_RESTORE %stack.2, $sgpr32, 8, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_AV32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr7 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, $vgpr7, 0, killed $vgpr3, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_AV32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: renamable $vgpr6 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr6_vgpr7 - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, $vgpr4, 0, $vgpr6, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr2 = SI_SPILL_AV32_RESTORE %stack.2, $sgpr32, 12, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr3 = nofpexcept V_DIV_FIXUP_F32_e64 0, $vgpr1, 0, $vgpr7, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr2 = SI_SPILL_AV32_RESTORE %stack.2, $sgpr32, 8, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr2 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr6_vgpr7 = SI_SPILL_AV64_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr1, 0, $vgpr4, 0, $vgpr6, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5) ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir index 0df2e651a15e1..fb0b239f94d64 100644 --- a/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir @@ -76,7 +76,7 @@ body: | ; CHECK-NEXT: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) ; CHECK-NEXT: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) ; CHECK-NEXT: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128)) - ; CHECK-NEXT: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S128_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4 = BUFFER_LOAD_FORMAT_XYZW_IDXEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; CHECK-NEXT: KILL killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 ; CHECK-NEXT: KILL killed renamable $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir index b3f66b91ece82..916c72d4aa256 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir @@ -68,7 +68,7 @@ body: | successors: liveins: $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr13 - $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr13, implicit-def dead $m0 :: (load (s64) from %stack.0, align 4, addrspace 5) + $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr13, implicit-def dead $m0 :: (load (s64) from %stack.0, align 4, addrspace 5) $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc bb.3: @@ -133,7 +133,7 @@ body: | successors: liveins: $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr13 - $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr13, implicit-def dead $m0 :: (load (s64) from %stack.0, align 4, addrspace 5) + $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr13, implicit-def dead $m0 :: (load (s64) from %stack.0, align 4, addrspace 5) $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir b/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir index 0f0dd2374156f..b9ff6a2c32e7f 100644 --- a/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir +++ b/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir @@ -20,7 +20,7 @@ body: | ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY $vcc ; CHECK-NEXT: SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY $vcc - ; CHECK-NEXT: $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: $vcc = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, 3, killed $sgpr4_sgpr5, implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit killed $vgpr0, implicit killed renamable $vcc %0:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll b/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll index ea127323f3e05..4c11050b91e8d 100644 --- a/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll +++ b/llvm/test/CodeGen/AMDGPU/use-after-free-after-cleanup-failed-vreg.ll @@ -1,9 +1,92 @@ -; RUN: not llc -mcpu=gfx1100 -mtriple=amdgcn-amd-amdhsa -stress-regalloc=4 -filetype=null -verify-machineinstrs %s 2>&1 | FileCheck %s - -; CHECK: error: :0:0: ran out of registers during register allocation in function 'f' -; CHECK-NOT: Bad machine code +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-amd-amdhsa -stress-regalloc=4 -verify-machineinstrs < %s | FileCheck %s define <16 x half> @f(i1 %LGV2, <16 x half> %0) { +; CHECK-LABEL: f: +; CHECK: ; %bb.0: ; %BB +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: s_clause 0x6 ; 28-byte Folded Spill +; CHECK-NEXT: scratch_store_b32 off, v8, s32 offset:100 +; CHECK-NEXT: scratch_store_b32 off, v7, s32 offset:156 +; CHECK-NEXT: scratch_store_b32 off, v6, s32 offset:152 +; CHECK-NEXT: scratch_store_b32 off, v5, s32 offset:148 +; CHECK-NEXT: scratch_store_b32 off, v4, s32 offset:144 +; CHECK-NEXT: scratch_store_b32 off, v3, s32 offset:140 +; CHECK-NEXT: scratch_store_b32 off, v2, s32 offset:136 +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_and_saveexec_b32 s1, s0 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %SW_C +; CHECK-NEXT: v_pk_mul_f16 v0, v1, 0 +; CHECK-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:100 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_pk_mul_f16 v7, v0, 0 +; CHECK-NEXT: s_clause 0x1 ; 32-byte Folded Spill +; CHECK-NEXT: scratch_store_b128 off, v[0:3], s32 offset:4 +; CHECK-NEXT: scratch_store_b128 off, v[4:7], s32 offset:20 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:156 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_pk_mul_f16 v6, v0, 0 +; CHECK-NEXT: s_clause 0x1 ; 32-byte Folded Spill +; CHECK-NEXT: scratch_store_b128 off, v[0:3], s32 offset:36 +; CHECK-NEXT: scratch_store_b128 off, v[4:7], s32 offset:52 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:152 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_pk_mul_f16 v5, v0, 0 +; CHECK-NEXT: s_clause 0x1 ; 32-byte Folded Spill +; CHECK-NEXT: scratch_store_b128 off, v[0:3], s32 offset:68 +; CHECK-NEXT: scratch_store_b128 off, v[4:7], s32 offset:84 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:148 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_pk_mul_f16 v4, v0, 0 +; CHECK-NEXT: s_clause 0x1 ; 32-byte Folded Spill +; CHECK-NEXT: scratch_store_b128 off, v[0:3], s32 offset:104 +; CHECK-NEXT: scratch_store_b128 off, v[4:7], s32 offset:120 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:144 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_pk_mul_f16 v5, v0, 0 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:140 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_pk_mul_f16 v2, v0, 0 +; CHECK-NEXT: s_clause 0x1 ; 32-byte Folded Spill +; CHECK-NEXT: scratch_store_b128 off, v[0:3], s32 offset:160 +; CHECK-NEXT: scratch_store_b128 off, v[4:7], s32 offset:176 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:136 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_pk_mul_f16 v1, v0, 0 +; CHECK-NEXT: s_clause 0x1 ; 32-byte Folded Spill +; CHECK-NEXT: scratch_store_b128 off, v[0:3], s32 offset:192 +; CHECK-NEXT: scratch_store_b128 off, v[4:7], s32 offset:208 +; CHECK-NEXT: .LBB0_2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: s_clause 0x6 ; 28-byte Folded Reload +; CHECK-NEXT: scratch_load_b32 v4, off, s32 offset:120 +; CHECK-NEXT: scratch_load_b32 v5, off, s32 offset:88 +; CHECK-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; CHECK-NEXT: scratch_load_b32 v7, off, s32 offset:32 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 +; CHECK-NEXT: scratch_load_b32 v1, off, s32 offset:196 +; CHECK-NEXT: scratch_load_b32 v2, off, s32 offset:168 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] BB: br i1 %LGV2, label %SW_C3, label %SW_C diff --git a/llvm/test/CodeGen/AMDGPU/wwm-spill-superclass-pseudo.mir b/llvm/test/CodeGen/AMDGPU/wwm-spill-superclass-pseudo.mir index bdf9d88bbd8bb..c1223cc8e99d9 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-spill-superclass-pseudo.mir +++ b/llvm/test/CodeGen/AMDGPU/wwm-spill-superclass-pseudo.mir @@ -74,7 +74,7 @@ body: | bb.7: liveins: $sgpr60_sgpr61 $exec = S_OR_B64 $exec, killed $sgpr60_sgpr61, implicit-def $scc - $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 + $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, 0, implicit $exec, implicit $sgpr32 S_BRANCH %bb.5 bb.8: